Example 1
def prepare_a_tf(corpus_root, corpus, year, mallet_act_results):

    tf_file = pnames.tv_filepath(corpus_root, corpus, year, "tf", "", "")
    # subset and cat_type are fixed as "a" and "pn"
    a_tf_file = pnames.tv_filepath(corpus_root, corpus, year, "tf", "a", "pn")
    res_lines = open(mallet_act_results, 'r').readlines()
    tf_lines = open(tf_file, 'r').readlines()
    s_a_tf = open(a_tf_file, 'w')

    attDict = {}

    print "building attributes dict..."
    for i in range(0, len(res_lines)):
        fields = res_lines[i].split()
        tuples = [('c',float(fields[2])), ('t',float(fields[4])), ('a',float(fields[6]))]
        num_tuples = [e[1] for e in tuples]
        max_list = [e[0] for e in tuples if e[1] == max(num_tuples)]
        if 'a' in max_list:
            attDict[fields[0]] = 1            
    count = 0
    #sortedKeys = sorted(attDict.keys())
    for line in tf_lines:

        count+=1
        if count % 100000 == 0:
            print count
        term = '_'.join(line.split('\t')[0].split())
        if term in attDict:
            s_a_tf.write(line)

    s_a_tf.close()
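A hedged note on the parsing above: the fields[2], fields[4] and fields[6] indexing assumes whitespace-separated Mallet result lines with the category labels in a fixed c, t, a order, for example (term and scores are illustrative):

    web_server c 0.12 t 0.33 a 0.55

With a line like this, 'a' carries the highest score, so "web_server" is added to attDict and its lines from the .tf file are copied to the a_tf_file output.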
Example 2
def run_classify(corpus, year, cat_type, subset=""):

    # corpus_root = "/home/j/anick/patent-classifier/ontology/creation/data/patents/"
    #tv_loc = "/data/tv/"
    outfile_qualifier = "cat"
    priors_qualifier = "cat_prob"
    terms_qualifier = "tf"
    term2freq_qualifier = "terms"
    lfgc_qualifier = "fc_kl"

    ################ variable parts of path
    outfile_year = str(year)
    year_cat_name = outfile_year + "." + cat_type
    #corpus = "ln-us-cs-500k"
    #corpus = "ln-us-12-chemical"
    ################

    #print "[run_classify]Output dir: %s" % tv_loc

    #path_to_terms_file = outroot + corpus + tv_loc + outfile_year + "."
    #path_to_terms_file =  pnames.tv_filepath(corpus_root, corpus, year, "tf", subset, "")
    #path_to_file = outroot + corpus + tv_loc + year_cat_name + "."
    #priors_file = path_to_file + priors_qualifier
    priors_file = pnames.tv_filepath(corpus_root, corpus, year,
                                     priors_qualifier, subset, cat_type)
    #terms_file = path_to_terms_file + terms_qualifier
    terms_file = pnames.tv_filepath(corpus_root, corpus, year, terms_qualifier,
                                    subset, "")

    #lfgc_file = path_to_file + lfgc_qualifier
    lfgc_file = pnames.tv_filepath(corpus_root, corpus, year, lfgc_qualifier,
                                   subset, cat_type)

    #term2freq_file = path_to_terms_file + term2freq_qualifier
    term2freq_file = pnames.tv_filepath(corpus_root, corpus, year,
                                        term2freq_qualifier, "", "")

    # compute l_cats, l_priors, d_lfgc, d_term2feats once and use them to run several thresholds
    print "[nbayes.py]priors_file: %s" % priors_file
    (l_cats, l_priors) = populate_priors(priors_file)

    print "[nbayes.py]lfgc_file: %s" % lfgc_file
    d_lfgc = populate_lfgc(lfgc_file)

    print "[nbayes.py]terms_file: %s" % terms_file
    d_term2feats = populate_terms(terms_file)

    print "[nbayes.py]term2freq_file: %s" % term2freq_file
    d_term2freq = populate_term2freq(term2freq_file)

    # min_weight = .2
    #for min_weight in [.1, .2]:
    for cutoff in [.1, .05, .0]:
        cutoff_qualifier = role.cat_cutoff_file_type(cutoff)
        #outfile = path_to_file + outfile_qualifier + ".w" + cutoff_qualifier
        outfile = pnames.tv_filepath(corpus_root, corpus, year,
                                     cutoff_qualifier, subset, cat_type)
        print "[nbayes.py]classifying into outfile: %s" % outfile
        classify(l_cats, l_priors, d_lfgc, d_term2feats, d_term2freq, cutoff,
                 outfile)
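A minimal invocation sketch, assuming corpus_root is bound at module level as in the commented-out path above; the corpus name is taken from the commented-out examples and the year is illustrative:

    # writes one classification output file per cutoff in [.1, .05, .0]
    run_classify("ln-us-cs-500k", 1997, "act", subset="")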
Example 3
def run_domain_score(corpus1, corpus1_size, corpus2, corpus2_size, year):
    # corpus_root = "/home/j/anick/patent-classifier/ontology/creation/data/patents/"
    #outfile_name = corpus1 + "_" + corpus2 + ".ds"
    outfile = pnames.tv_filepath(corpus_root, corpus1, year, "ds", "", "")
    f_terms1 = pnames.tv_filepath(corpus_root, corpus1, year, "terms", "", "")
    f_terms2 = pnames.tv_filepath(corpus_root, corpus2, year, "terms", "", "")

    domain_score(f_terms1, corpus1_size, f_terms2, corpus2_size, outfile)
Example 4
def cat_filter(corpus_root, corpus, year, cat_type, subset, min_freq,
               min_domain_score, max_freq):
    cat_file_type = "cat.w0.0"
    f_cat = pnames.tv_filepath(corpus_root, corpus, year, cat_file_type,
                               subset, cat_type)
    f_ds = pnames.tv_filepath(corpus_root, corpus, year, "ds", "", "")
    out_file_type = cat_file_type + "_r" + str(min_freq) + "-" + str(
        max_freq) + "_ds" + str(min_domain_score)
    f_out = pnames.tv_filepath(corpus_root, corpus, year, out_file_type,
                               subset, cat_type)

    d_term2cat = {}
    d_term2ds = {}

    s_cat = codecs.open(f_cat, encoding='utf-8')
    s_ds = codecs.open(f_ds, encoding='utf-8')
    s_out = codecs.open(f_out, "w", encoding='utf-8')

    # store domain_scores
    for line in s_ds:
        line = line.strip()
        #proximal zone   5       1       1.841114
        (term, freq, generic_freq, domain_score) = line.split("\t")
        d_term2ds[term] = float(domain_score)

    # categorized terms
    for line in s_cat:
        line = line.strip()
        l_fields = line.split("\t")
        term = l_fields[0]
        cat = l_fields[3]
        try:
            freq = int(l_fields[4])
        except:
            print "[cat_filter]In line: %s" % line
            print "[cat_filter]Illegal integer in field 4: [%s][%s][%s][%s][%s][%s]" % (
                l_fields[0], l_fields[1], l_fields[2], l_fields[3],
                l_fields[4], l_fields[5])
            # bare 'quit' was a no-op; exit explicitly (assumes 'import sys' at module level)
            sys.exit(1)
        ds = d_term2ds[term]
        # filter and output
        if ds >= min_domain_score and (freq >= min_freq and freq <= max_freq):
            s_out.write("%s\t%s\t%i\t%f\n" % (term, cat, freq, ds))

    s_cat.close()
    s_ds.close()
    s_out.close()
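Worked example of the output naming above: with min_freq=10, max_freq=100000 and min_domain_score=1.5 (the first default range in run_steps below), out_file_type becomes "cat.w0.0_r10-100000_ds1.5", which pnames.tv_filepath then uses to place the filtered file alongside the unfiltered .cat.w0.0 file.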
Example 5
def run_steps(corpus,
              year,
              todo_list=["nb", "ds", "cf"],
              ranges=[[10, 100000, 1.5], [2, 10, 1.5]],
              cat_type="act",
              subset=""):
    #parameters
    code_root = roles_config.CODE_ROOT
    # path to corpus
    # corpus_root = code_root + "data/patents/"
    corpus1_size_file = pnames.tv_filepath(corpus_root, corpus, year, "cs", "",
                                           "")
    # generic corpus for domain specificity computation
    corpus2 = "ln-us-all-600k"
    corpus2_size_file = pnames.tv_filepath(corpus_root, corpus2, year, "cs",
                                           "", "")

    # read in the corpus sizes
    with open(corpus1_size_file, 'r') as f:
        corpus1_size = int(f.readline().strip("\n"))

    with open(corpus2_size_file, 'r') as f:
        corpus2_size = int(f.readline().strip("\n"))

    if "nb" in todo_list:
        # from .fc_kl, create act.cat.w0.0
        print "[run_steps]step nb, Creating .cat.w0.0"
        run_classify(corpus, year, cat_type, subset)
    if "ds" in todo_list:
        # from the .terms files, create .ds
        print "[run_steps]step ds, Creating .ds"
        run_domain_score(corpus, corpus1_size, corpus2, corpus2_size, year)
    if "cf" in todo_list:

        # run cat_filter for each range
        for (min_freq, max_freq, min_domain_score) in ranges:

            # from .ds and .cat.w0.0, create .cat.w0.0_r<min_freq>-<max_freq>_ds<min_domain_score>
            print "[run_steps]step cf, Creating .cat.w0.0_r%s-%s_ds%s" % (
                min_freq, max_freq, min_domain_score)
            #min_freq = 5
            #min_domain_score = 2
            run_cat_filter(corpus, year, min_freq, min_domain_score, max_freq,
                           cat_type, subset)

    print "[run_steps]Reached end of todo_list"
Example 6
def prepare_classify(corpus_root, corpus, year, cat_type, subset):

    #tf_file = pnames.tv_filepath(corpus_root, corpus, year, "tf", subset, cat_type)
    # try making cat_type empty for tf file
    tf_file = pnames.tv_filepath(corpus_root, corpus, year, "tf", subset, "")
    print "[prepare_classify]Preparing to open the .tf file: %s" % tf_file
    tf_lines = open(tf_file).readlines()
    print "[prepare_classify]Finished uploading .tf file!"
   

    termDict = {}

    print "Creating term dict..."

    count1 = 0

    for line in tf_lines:
        if count1 % 100000 == 0:
            print count1
        count1 += 1
        fields = line.split('\t')
        term = '_'.join(fields[0].split())
        feature = fields[1]
        count = fields[2]
        fc = feature+":"+count
        if term in termDict:
            termDict[term].append(fc)
        else:
            termDict[term] = [fc]
    
    class_input_file = pnames.tv_filepath(corpus_root, corpus, year, "unlab", subset, cat_type)
    print "class_input_file: %s" % class_input_file
    class_input = open(class_input_file, 'w')

    print "Writing into file..."
    print "Len of dict is"+ str(len(termDict))

    for term in termDict:
        features = ' '.join(termDict[term])
        class_input.write(term+'\t'+features+'\n')

    class_input.close()
Example 7
def run_diff_score(corpus, year1, year2):
    # corpus_root = "/home/j/anick/patent-classifier/ontology/creation/data/patents/"
    outfile_years = str(year1) + "_" + str(year2)
    outfile = pnames.tv_filepath(corpus_root, corpus, outfile_years, "diff",
                                 "", "")
    f_terms1 = pnames.tv_filepath(corpus_root, corpus, year1, "terms", "", "")
    f_terms2 = pnames.tv_filepath(corpus_root, corpus, year2, "terms", "", "")
    cat_file = pnames.tv_filepath(corpus_root, corpus, year1, "cat.w0.0", "",
                                  "act")
    f_ds1 = pnames.tv_filepath(corpus_root, corpus, year1, "ds", "", "")

    # read in the corpus sizes
    y1_size_file = pnames.tv_filepath(corpus_root, corpus, year1, "cs", "", "")
    y2_size_file = pnames.tv_filepath(corpus_root, corpus, year2, "cs", "", "")
    y1_size = 0
    y2_size = 0
    with open(y1_size_file, 'r') as f:
        y1_size = int(f.readline().strip("\n"))

    with open(y2_size_file, 'r') as f:
        y2_size = int(f.readline().strip("\n"))

    diff_score(f_terms1, y1_size, f_terms2, y2_size, f_ds1, cat_file, outfile)
Example 8
def prepare_train(corpus_root, corpus, year, cat_type, subset):
    tf_file = pnames.tv_filepath(corpus_root, corpus, year, "tf", subset, "")
    seed_file = pnames.tv_filepath(corpus_root, corpus, year, "tcs", subset, cat_type)
    
    #s_tf = open(tf_file)
    tf_lines = open(tf_file, 'r').readlines()
    print "[prepare_train]opening .tf file: %s " % tf_file
    s_tcs = open(seed_file)
    print "[prepare_train]opening .tcs (seed)file: %s" % seed_file

    termDict = {}

    for line in s_tcs:
        fields = line.split('\t')
        # replace spaces with "_" in terms since mallet parser uses spaces as separators
        term = '_'.join(fields[0].split())
        ##print term
        # A term has a label and a list of features
        termDict[term] = [fields[1], []]

    print "[prepare_train]Done creating term dictionary"
    print "[prepare_train]Building feature dictionary..."


    count1 = 0
    sortedKeys = sorted(termDict.keys())
    
    for line in tf_lines:
        if count1 % 100000 == 0:
            print count1
        count1 = count1 + 1
        fields = line.split('\t')
        term = '_'.join(fields[0].split())
        ##print term
        feat_val = (fields[1], fields[2])
        if term in termDict:
            termDict[term][1].append(feat_val)

    print "Finished building feature dictionary!"
    print "Writing to file..."
    #s_tf.close()
    s_tcs.close()
    
    mallet_in_file = pnames.tv_filepath(corpus_root, corpus, year, "train", subset, cat_type)
    s_mallet_in = open(mallet_in_file, 'w')
    
    for term in termDict.keys():
        s_mallet_in.write(term+'\t'+termDict[term][0]+'\t')
        if len(termDict[term][1]) == 0:
            print "[prepare_train]No features found for term: %s" % term
        for f_v in termDict[term][1]:
            s_mallet_in.write(f_v[0]+":"+f_v[1])
            s_mallet_in.write(" ")
        s_mallet_in.write("\n")

    print "Created mallet_in file in directory!"

    # create mallet vectors file from .train data
    #/home/j/corpuswork/fuse/code/patent-classifier/tools/mallet/mallet-2.0.7/bin/csv2vectors --input myInput.train --output myInput.vectors

    # create classifier from .vectors
    # /home/j/corpuswork/fuse/code/patent-classifier/tools/mallet/mallet-2.0.7/bin/vectors2classify --input myInput.vectors --training-portion 0.9 --trainer NaiveBayes --output-classifier <file>.NBclassifier > <file>.mallet_stats

    s_mallet_in.close()
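For reference, each line written to the .train file has the shape term<TAB>label<TAB>feat1:count1 feat2:count2 ... (with a trailing space before the newline), e.g. with a hypothetical term and feature names:

    web_server<TAB>a<TAB>prev_V=host:12 last_word=server:40

This is the file handed to the csv2vectors command quoted in the comments above.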
Example 9
def feat_probs(corpus, cohort_year, prob_year, feature_year):
    # create file names
    cohort_file = pnames.tv_filepath(corpus_root, corpus, cohort_year,
                                     "cohort.filt.gold", "", "")
    tf_file = pnames.tv_filepath(corpus_root, corpus, prob_year, "tf", "", "")
    feats_file = pnames.tv_filepath(corpus_root, corpus, feature_year,
                                    "feats.1000", "", "")

    year_offset = prob_year - cohort_year
    offset_probs_str = str(year_offset) + ".probs"
    fgt_file = pnames.tv_filepath(corpus_root, corpus, cohort_year,
                                  offset_probs_str, "", "")
    print "[cohort.py feat_probs]]Writing to: %s" % fgt_file

    s_cohort_file = codecs.open(cohort_file, encoding='utf-8')
    s_tf_file = codecs.open(tf_file, encoding='utf-8')
    s_feats_file = codecs.open(feats_file, encoding='utf-8')
    s_fgt_file = codecs.open(fgt_file, "w", encoding='utf-8')

    #dictionaries
    # sum of probs for feature given cohort term
    d_feat2sum_prob_fgct = collections.defaultdict(int)
    # sum of probs for feature given any term
    d_feat2sum_prob_fgt = collections.defaultdict(int)

    # count of number terms contributing to the sum of probs, so
    # that we can divide by the count to calculate the average.
    d_feat2_count_fgct = collections.defaultdict(int)
    d_feat2_count_fgt = collections.defaultdict(int)

    # Boolean dictionaries to keep track of sets of items
    # features of interest
    d_feats = {}
    # terms in gold cohort
    d_cohort = {}

    # terms in corpus
    d_all_terms = {}

    # import features
    for line in s_feats_file:
        line = line.strip()
        l_fields = line.split("\t")
        feat = l_fields[0]
        d_feats[feat] = True

    # import gold_cohort
    for line in s_cohort_file:
        line = line.strip()
        # lines starting with "#" carry info about the thresholds for the cohort growth
        if line and not line.startswith("#"):
            l_fields = line.split("\t")
            term = l_fields[0]
            d_cohort[term] = True

    #pdb.set_trace()

    # import f|t probs
    for line in s_tf_file:
        line = line.strip()
        l_fields = line.split("\t")
        term = l_fields[0]
        feat = l_fields[1]

        # keep track of all terms seen to count them later
        d_all_terms[term] = True

        if feat in d_feats:
            # if this is a feature we are interested in
            prob = float(l_fields[4])
            # add its prob to the total for this feature,
            # given any term
            d_feat2sum_prob_fgt[feat] = prob + d_feat2sum_prob_fgt[feat]
            #d_feat2_count_fgt[feat] += 1

            # if the term is a cohort term, also add the prob
            # to the total for feature given cohort term
            if term in d_cohort:
                d_feat2sum_prob_fgct[feat] = prob + d_feat2sum_prob_fgct[feat]
                #d_feat2_count_fgct[feat] += 1

    #pdb.set_trace()
    # output probs
    count_all_terms = len(d_all_terms)
    count_gold_terms = len(d_cohort)

    print "[cohort.py] total terms in corpus: %i, in gold set: %i" % (
        count_all_terms, count_gold_terms)
    for feat in d_feats.keys():
        average_prob_fgt = float(d_feat2sum_prob_fgt[feat]) / count_all_terms
        average_prob_fgct = float(
            d_feat2sum_prob_fgct[feat]) / count_gold_terms

        diff = average_prob_fgct - average_prob_fgt
        ratio = average_prob_fgct / average_prob_fgt
        s_fgt_file.write(
            "%s\t%f\t%f\t%f\t%f\n" %
            (feat, average_prob_fgct, average_prob_fgt, diff, ratio))

    s_cohort_file.close()
    s_tf_file.close()
    s_feats_file.close()
    s_fgt_file.close()
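A worked example of the output columns above (hypothetical feature and counts): if feature prev_V=use accumulates a summed probability of 2.0 over 10,000 gold-cohort terms and 50.0 over 1,000,000 corpus terms, the line written is prev_V=use with average_prob_fgct = 0.0002, average_prob_fgt = 0.00005, diff = 0.00015 and ratio = 4.0.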
Example 10
def filter_tf_file(corpus_root, corpus, year, act_file_type):
    #tf_file = tv_root + str(year) + ".tf"
    tfa_subset = "a"
    tft_subset = "t"

    tf_file = pnames.tv_filepath(corpus_root,
                                 corpus,
                                 year,
                                 "tf",
                                 "",
                                 cat_type="")
    tfa_file = pnames.tv_filepath(corpus_root,
                                  corpus,
                                  year,
                                  "tf",
                                  tfa_subset,
                                  cat_type="")
    tft_file = pnames.tv_filepath(corpus_root,
                                  corpus,
                                  year,
                                  "tf",
                                  tft_subset,
                                  cat_type="")
    print "[filter_tf_file]Creating tfa_file: %s" % tfa_file
    print "[filter_tf_file]Creating tft_file: %s" % tft_file

    act_file = pnames.tv_filepath(corpus_root, corpus, year, act_file_type, "",
                                  "act")
    print "[filter_tf_file]Reading from act_file: %s" % act_file

    s_tfa = codecs.open(tfa_file, "w", encoding='utf-8')
    s_tft = codecs.open(tft_file, "w", encoding='utf-8')

    d_term2cat = defaultdict(str)

    # store the category of each term labeled a and t
    s_act_file = codecs.open(act_file, encoding='utf-8')
    for line in s_act_file:
        line = line.strip("\n")
        l_fields = line.split("\t")
        term = l_fields[0]
        cat = l_fields[3]
        d_term2cat[term] = cat
        #print "term: %s, cat: %s" % (term, cat)
    s_act_file.close()

    # create subset files of .tf for the a and t terms
    s_tf_file = codecs.open(tf_file, encoding='utf-8')
    for line in s_tf_file:
        # don't bother to strip off newline
        # just grab the term
        term = line.split("\t")[0]
        cat = d_term2cat[term]
        if cat == "a":
            s_tfa.write(line)
        elif cat == "t":
            s_tft.write(line)

    s_tf_file.close()
    s_tfa.close()
    s_tft.close()