def __init__(self, corpus, term_file, feat_file, num_feats, start_year=1997, end_year=2007):
    # open input files
    term_path = pnames.tv_dir(corpus_root, corpus) + "/" + term_file
    feat_path = pnames.tv_dir(corpus_root, corpus) + "/" + feat_file
    s_term = codecs.open(term_path, encoding='utf-8')
    s_feat = codecs.open(feat_path, encoding='utf-8')

    self.d_term = {}
    self.d_feat2rank = {}
    self.d_rank2feat = {}
    self.d_term_year_feat2freq = defaultdict(int)
    self.d_term_year2disp = defaultdict(int)
    self.d_term_year2feats = defaultdict(list)

    # load the terms
    for line in s_term:
        line = line.strip("\n")
        fields = line.split("\t")
        self.d_term[fields[0]] = True

    # load the features in term cooccurrence frequency order
    rank = 1
    for line in s_feat:
        line = line.strip("\n")
        fields = line.split("\t")
        self.d_feat2rank[fields[0]] = rank
        self.d_rank2feat[rank] = fields[0]
        rank += 1

    # for each year, store freq of feature for the term (using .tf data)
    end_range = end_year + 1
    for year in range(start_year, end_range):
        tf_file = pnames.tv_dir_year_file(corpus_root, corpus, year, "tf")
        s_tf = codecs.open(tf_file, encoding='utf-8')
        for line in s_tf:
            line = line.strip("\n")
            fields = line.split("\t")
            term = fields[0]
            feat = fields[1]
            # store freq as an int (the .tf file stores it as a string field)
            freq = int(fields[2])
            if self.d_term.has_key(term):
                # increment dispersion count for each new feature appearing with a term in a year
                self.d_term_year2disp[tuple([term, year])] += 1
                # todo/// add entropy here
                if self.d_feat2rank.has_key(feat):
                    self.d_term_year_feat2freq[tuple([term, year, feat])] = freq
                    self.d_term_year2feats[tuple([term, year])].append([feat, freq])
        s_tf.close()

    s_term.close()
    s_feat.close()
def term_to_year1(start_year, end_year, corpus_list):
    # value is the first year in which a term appears within any corpus in the corpus_list
    term2year1 = {}
    end_range = end_year + 1

    # the terms and start years are written to the .tstart file
    year_range = str(start_year) + "_" + str(end_year)
    term_start_file = pnames.tv_dir_year_file(corpus_root, "all", year_range, "tstart")
    print "[term_to_year1] term_start_file: %s" % term_start_file
    # .neo is the same as the .tstart file but filters out any terms that first appear in year 1.
    # Thus it includes only neologisms appearing after year 1.
    term_neo_file = pnames.tv_dir_year_file(corpus_root, "all", year_range, "neo")
    print "[term_to_year1] term_neo_file: %s" % term_neo_file

    # iterate over years in the outer loop so that the first time we encounter a term
    # it is in its earliest year across all corpora
    for year in range(start_year, end_range):
        for corpus in corpus_list:
            term_file = pnames.tv_dir_year_file(corpus_root, corpus, str(year), "terms")
            print "[term_to_year1] processing term_file: %s" % term_file
            s_term_file = codecs.open(term_file, encoding='utf-8')
            for term_line in s_term_file:
                term_line = term_line.strip("\n")
                term_fields = term_line.split("\t")
                term = term_fields[0]
                # if the term is not in our table, enter it along with the current year as start year
                if not term2year1.has_key(term):
                    term2year1[term] = year
            s_term_file.close()

    # write the terms and start years into the .tstart and .neo files
    s_term_start_file = codecs.open(term_start_file, "w", encoding='utf-8')
    s_term_neo_file = codecs.open(term_neo_file, "w", encoding='utf-8')
    for term in term2year1.keys():
        first_year = term2year1[term]
        s_term_start_file.write("%s\t%i\n" % (term, first_year))
        if first_year != start_year:
            # then include the term as a neologism
            s_term_neo_file.write("%s\t%i\n" % (term, first_year))
    s_term_start_file.close()
    s_term_neo_file.close()
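# Hedged usage sketch for term_to_year1 (the corpus names below are illustrative, not taken
# from this repo's configuration): collect first-appearance years across two corpora for the
# 1997-2007 range.
#
#   term_to_year1(1997, 2007, ["ln-us-cs-500k", "ln-us-all-600k"])
#
# The .tstart file then holds "<term>\t<first year>" for every term, while the .neo file holds
# only the terms whose first year is after 1997 (the neologisms within the range).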
def __init__(self, corpus, year):
    year = str(year)
    # term file is used for extracting the doc frequency of terms for the year
    #term_file = corpus_root + "/" + corpus + "/data/tv/" + year + ".tf.f"
    #term_file = pnames.tv_dir_year_file(corpus_root, corpus, year, "tf.f")
    #term_file = pnames.tv_dir_year_file(corpus_root, corpus, year, "terms")
    term_file = pnames.tv_dir_year_file(corpus_root, corpus, year, "terms.2")

    self.d_term2heads = defaultdict(list)
    self.d_term2mods = defaultdict(list)
    self.d_head2terms = defaultdict(list)
    self.d_mod2terms = defaultdict(list)
    self.d_head2count = defaultdict(int)
    self.d_head2count_2 = defaultdict(int)
    self.d_mod2count = defaultdict(int)
    self.d_mod2count_2 = defaultdict(int)
    self.term_count = 0
    self.headed_term_count = 0
    self.headed_term_count_2 = 0
    self.modified_term_count = 0
    self.modified_term_count_2 = 0
    self.d_term2freq = defaultdict(int)
    self.l_singletons = []
    self.l_head_counts = []
    self.l_mod_counts = []
    # sum of the frequencies for all terms containing the mod or head;
    # use this to capture the average spread
    self.d_mod2sum_freq = defaultdict(int)
    self.d_head2sum_freq = defaultdict(int)
    self.d_mod2average_spread = defaultdict(int)
    self.d_head2average_spread = defaultdict(int)
    # list sorted by freq [[term, freq],...]
    self.l_tf = []

    # open the file and import the list of terms
    s_term_file = codecs.open(term_file, encoding='utf-8')
    for term_line in s_term_file:
        term_line = term_line.strip("\n")
        term_fields = term_line.split("\t")
        term = term_fields[0]
        # freq is the number of docs the term occurred in (this year)
        freq = term_fields[1]
        freq = int(freq)
        self.d_term2freq[term] = freq
        self.term_count += 1
        self.l_tf.append([term, freq])
    s_term_file.close()

    # sort the term list by doc frequency
    self.l_tf.sort(utils.list_element_2_sort)

    self.compute_heads_mods()
def filter(self, cohort_year, ref_year, target_year, ref_min, ref_max, target_min, target_max, filter_type):
    l_matches = []
    file_qualifier = "cohort." + filter_type
    cohort_file = pnames.tv_dir_year_file(corpus_root, self.corpus, cohort_year, file_qualifier)
    s_cohort_file = codecs.open(cohort_file, "w", encoding='utf-8')
    # write the parameters of the cohort as the first line in the file
    s_cohort_file.write("#%i\t%i\t%i\t%i\t%i\t%i\t%i\n" % (cohort_year, ref_year, target_year, ref_min, ref_max, target_min, target_max))
    for term in self.d_y2l_cohort[cohort_year]:
        rf = self.d_ty2freq[tuple([term, ref_year])]
        tf = self.d_ty2freq[tuple([term, target_year])]
        if rf >= ref_min and rf <= ref_max and tf >= target_min and tf <= target_max:
            l_matches.append([term, rf, tf])
            # save to a file as well
            s_cohort_file.write("%s\t%i\t%i\n" % (term, rf, tf))
    s_cohort_file.close()
    return(l_matches)
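# Hedged usage sketch for filter() (the class name RFreq is inferred from log messages
# elsewhere in this module; the corpus name and the filter_type label are illustrative):
#
#   rf = RFreq("ln-us-cs-500k", 1997, 2007)
#   # cohort of terms first seen in 2000 that occurred in at most 5 docs in 2000
#   # but in at least 50 docs by 2003
#   l_matches = rf.filter(2000, 2000, 2003, 1, 5, 50, 100000, "growth")
#
# Each element of l_matches is [term, ref_year_freq, target_year_freq]; the same rows are
# written to the cohort.<filter_type> file for the cohort year.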
def cohort_features(corpus, year, l_cohort, cohort_name):
    # cohort_term_feature => total freq
    # This accumulates the count of occurrences of a cohort term
    # with a feature in the given year
    d_cf2freq = defaultdict(int)
    # any_term_feature => total freq
    # This accumulates the count of occurrences of any term
    # with a feature in the given year
    d_tf2freq = defaultdict(int)
    sum_cohort_feature_occurrences = 0
    sum_term_feature_occurrences = 0
    # keep a dict of all features encountered with cohort terms in the year
    d_feats = defaultdict(bool)
    # score consisting of prob(feature|cohort term) / prob(feature|term)
    d_cf_score = {}
    # cohort terms in dict form
    d_cohort = {}

    # output file for scores
    qualifier = cohort_name + ".fscores"
    score_file = pnames.tv_dir_year_file(corpus_root, corpus, year, qualifier)
    s_score_file = codecs.open(score_file, "w", encoding='utf-8')

    # store cohort list terms in a dict
    for (term, rf, tf) in l_cohort:
        d_cohort[term] = True

    year = str(year)
    tf_file = pnames.tv_dir_year_file(corpus_root, corpus, year, "tf")
    s_tf_file = codecs.open(tf_file, encoding='utf-8')
    print "[RFreq]loading terms for year: %s" % year
    for term_line in s_tf_file:
        term_line = term_line.strip("\n")
        (term, feat, freq, prob) = term_line.split("\t")
        freq = int(freq)
        if d_cohort.has_key(term):
            # update cohort term counts
            d_cf2freq[feat] += freq
            sum_cohort_feature_occurrences += freq
            # keep track of the cohort features seen
            d_feats[feat] = True
        # update all feature counts
        sum_term_feature_occurrences += freq
        d_tf2freq[feat] += freq

    sum_cohort_feature_occurrences = float(sum_cohort_feature_occurrences)
    sum_term_feature_occurrences = float(sum_term_feature_occurrences)
    for feat in d_feats.keys():
        prob_fgc = d_cf2freq[feat] / sum_cohort_feature_occurrences
        prob_fgt = d_tf2freq[feat] / sum_term_feature_occurrences
        if prob_fgt == 0:
            pdb.set_trace()
        d_cf_score[feat] = prob_fgc / prob_fgt

    l_scores_sorted = d_cf_score.items()
    l_scores_sorted.sort(key=itemgetter(1), reverse=True)
    for (feat, score) in l_scores_sorted:
        s_score_file.write("%.2f\t%s\t%i\t%i\n" % (score, feat, d_cf2freq[feat], d_tf2freq[feat]))

    s_tf_file.close()
    s_score_file.close()
    print "[fan.cohort_features]Wrote scores to %s" % score_file
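# The score written by cohort_features() is a relative-probability ratio:
#   score(f) = P(f | cohort terms) / P(f | all terms)
# A minimal standalone sketch with made-up counts (not from any corpus) showing the arithmetic:
def _example_feature_score():
    cohort_freq, cohort_total = 40, 1000   # feature seen 40 times with cohort terms, out of 1000 cohort feature tokens
    term_freq, term_total = 200, 50000     # feature seen 200 times with any term, out of 50000 feature tokens overall
    prob_fgc = cohort_freq / float(cohort_total)   # 0.04
    prob_fgt = term_freq / float(term_total)       # 0.004
    return prob_fgc / prob_fgt                     # 10.0 => feature is 10x more likely with cohort terms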
def __init__(self, corpus, start_year, end_year, term_subset_file=""):
    root = corpus_root
    # frequency for a term-year combination
    self.d_ty2freq = defaultdict(int)
    # number of terms in this year
    self.d_y2tcount = defaultdict(int)
    # number of new terms in this year
    self.d_y2ncount = defaultdict(int)
    # has the term been seen in any year so far
    self.d_term2seen = defaultdict(bool)
    # is term new in this range (i.e., appears after the first year)
    self.d_term2new = defaultdict(bool)
    # appearance year for term
    self.d_term2y1 = defaultdict(int)
    # all new terms in a year
    self.d_y2l_cohort = defaultdict(list)
    # list of freq for the term starting with first appearance year
    self.d_term2l_history = defaultdict(list)
    self.corpus = corpus

    self.term_subset_p = False
    if term_subset_file != "":
        self.term_subset_p = True
        self.d_term_subset = {}

    # If term_subset_file is not "", populate a dictionary of the subset of terms and
    # only use terms in this dictionary in cohorts.
    if self.term_subset_p:
        term_subset_path = pnames.tv_dir(corpus_root, corpus) + "/" + term_subset_file
        s_term_subset = codecs.open(term_subset_path, encoding='utf-8')
        for term_line in s_term_subset:
            term_line = term_line.strip("\n")
            term_fields = term_line.split("\t")
            term = term_fields[0]
            self.d_term_subset[term] = True
        s_term_subset.close()
        print "[fan.py Rfreq]Using term subset with %i terms" % len(self.d_term_subset.keys())

    for year in range(start_year, end_year + 1):
        #term_file = pnames.tv_dir_year_file(corpus_root, corpus, year, "terms.2")
        term_file = pnames.tv_dir_year_file(corpus_root, corpus, year, "terms")
        s_term_file = codecs.open(term_file, encoding='utf-8')
        print "[RFreq]loading terms for year: %i" % year
        for term_line in s_term_file:
            term_line = term_line.strip("\n")
            term_fields = term_line.split("\t")
            term = term_fields[0]
            if self.term_subset_p == False or self.d_term_subset.has_key(term):
                #pdb.set_trace()
                # freq is the number of docs the term occurred in (this year)
                freq = term_fields[1]
                freq = int(freq)
                ty = tuple([term, year])
                # save the freq for the year
                self.d_ty2freq[ty] = freq
                self.d_y2tcount[year] += 1

                # record the first appearance year (y1) for the term
                if not self.d_term2seen[term]:
                    # if the term does not appear in the start year, we will call it
                    # new in this range
                    if year != start_year:
                        self.d_term2new[term] = True
                        self.d_y2ncount[year] += 1
                        self.d_term2y1[term] = year
                        self.d_y2l_cohort[year].append(term)
                    # mark term as seen
                    self.d_term2seen[term] = True
        print "Loaded %i terms, %i new" % (self.d_y2tcount[year], self.d_y2ncount[year])
        s_term_file.close()
def create_json_chunks_file(index_name, type_name, corpus, start, end, docs_per_bulk_load=500, section_filter_p=True, write_to_file_p=False):
    # reading from fuse pipeline data
    # writing to local tv corpus dir
    # for years from start to end

    # we'll need the name of the pipeline step to create the directory path to
    # the phr_feats files.
    pipeline_step = "d3_phr_feats"

    # range parameters
    start_year = int(start)
    end_year = int(end)
    start_range = start_year
    end_range = end_year + 1

    # track the time in <year>.log
    log_file = pnames.tv_dir_year_file(corpus_root, corpus, "all", "log")
    s_log = open(log_file, "w")

    log_message = "Starting create_json_chunks_file for years: " + str(start) + " " + str(end)
    time = log.log_current_time(s_log, log_message, True)
    # remember the start_time for computing total time
    start_time = time

    # we'll bulk load all the data for a single year.
    # the argument to elasticsearch bulk is a list of dictionaries
    # alternating metadata and content.  We'll build this up in l_colloc_bulk_elements.
    # The output is a list of lists, where each list contains the meta/content elements for n files
    l_colloc_bulk_lists = []
    l_colloc_bulk_elements = []

    d_chunk2prev_Npr = defaultdict(set)
    d_chunk2prev_V = defaultdict(set)
    d_chunk2docs = defaultdict(set)

    for year in range(start_range, end_range):

        # loop through files in file_list_file for the year
        filelist_file = pnames.fuse_filelist(fuse_corpus_root, corpus, year)
        s_file_list = open(filelist_file)

        # track the number of lines output to json file
        num_lines_output = 0
        json_file = pnames.tv_dir(corpus_root, corpus) + str(year) + ".chunks.json"
        s_json = codecs.open(json_file, "w", encoding='utf-8')

        file_count = 0
        for line in s_file_list:

            # if we have reached the file limit for a single bulk api call, add the sublist
            # to l_colloc_bulk_lists and start a new sublist
            if (file_count % docs_per_bulk_load) == 0:
                # mod will be 0 for initial time through loop, so ignore this sublist
                if l_colloc_bulk_elements != []:
                    l_colloc_bulk_lists.append(l_colloc_bulk_elements)
                    l_colloc_bulk_elements = []

            file_count += 1
            line = line.strip("\n")
            # get the date/filename portion of path
            l_line_fields = line.split("\t")
            # get the rest of the file path (publication_year/id.xml)
            pub_year_and_file = l_line_fields[2]
            # extract patent_id from the filename (e.g. US5787464A from 1998/020/US5787464A.xml)
            patent_id = os.path.splitext(os.path.basename(pub_year_and_file))[0]

            phr_feats_file = pnames.fuse_phr_feats_file(fuse_corpus_root, corpus, pipeline_step, year, pub_year_and_file)

            #print "[invention]opening phr_feats: %s, id: %s" % (phr_feats_file, patent_id)
            #sys.exit()
            #s_phr_feats = codecs.open(phr_feats_file, encoding='utf-8')
            # handle compressed or uncompressed files
            s_phr_feats = open_input_file(phr_feats_file)

            # we need to combine all the chunks from a single sentence into one output entry
            l_chunks = []
            # assume the first sent_no in a document will always be 0
            last_sent_no = "0"
            for line in s_phr_feats:
                # todo make into regex ///
                if not(section_filter_p) or line.find("TITLE") > 0 or line.find("ABSTRACT") > 0 or line.find("SUMMARY") > 0:
                    # then process the line
                    l_data = line.split("\t")
                    # save the chunk as a phrase with "_" instead of blank connecting tokens
                    chunk = l_data[2].replace(" ", "_")
                    # extract the value field from the doc_loc feature to get the sent_no
                    sent_no = p_doc_loc.search(line).group(1)

                    # populate chunk dictionaries
                    d_chunk2docs[chunk].add(patent_id)
                    prev_V = p_prev_V.search(line)
                    if prev_V != None:
                        # store the matched feature value (group 1), not the match object
                        d_chunk2prev_V[chunk].add(prev_V.group(1))
                    prev_Npr = p_prev_Npr.search(line)
                    if prev_Npr != None:
                        d_chunk2prev_Npr[chunk].add(prev_Npr.group(1))

                    if sent_no == last_sent_no:
                        l_chunks.append(chunk)
                    else:
                        # we are done with the sentence, so write out the chunk list
                        json_string = format_colloc_chunks2json(patent_id, year, last_sent_no, l_chunks)
                        uid = "_".join([patent_id, last_sent_no])
                        #print "last_sent_no: %s, chunks: %s, json: %s" % (last_sent_no, l_chunks, json_string)
                        # note the above print gives an error for non-ascii chars.
                        if write_to_file_p:
                            # make a json file with all the data to be loaded into elasticsearch
                            s_json.write("%s\n" % json_string)
                        l_colloc_bulk_elements.append(format_d_action(index_name, type_name, uid))
                        l_colloc_bulk_elements.append(format_colloc_d_content(patent_id, year, last_sent_no, l_chunks))

                        # keep the current chunk
                        l_chunks = [chunk]
                        last_sent_no = sent_no
                        num_lines_output += 1

            # output the last sentence of the file
            json_string = format_colloc_chunks2json(patent_id, year, last_sent_no, l_chunks)
            uid = "_".join([patent_id, last_sent_no])
            #print "last_sent_no: %s, chunks: %s, json: %s" % (last_sent_no, l_chunks, json_string)
            s_json.write("%s\n" % json_string)
            l_colloc_bulk_elements.append(format_d_action(index_name, type_name, uid))
            l_colloc_bulk_elements.append(format_colloc_d_content(patent_id, year, last_sent_no, l_chunks))
            num_lines_output += 1

            #"""
            # stop after n files for debugging
            if file_count > 3000:
                break
            #"""

            s_phr_feats.close()

        # add the remaining elements to l_colloc_bulk_lists
        l_colloc_bulk_lists.append(l_colloc_bulk_elements)

        print "[docs.py]%i lines from %i files written to %s" % (num_lines_output, file_count, json_file)
        s_json.close()

    s_log.close()
    s_file_list.close()

    """
    # unfinished section to create chunk index
    # prepare data for chunk index
    for chunk in d_chunk2docs.keys():
        l_docs = d_chunk2docs[chunk]
        l_prev_V = d_chunk2prev_V[chunk]
        l_prev_Npr = d_chunk2prev_Npr[chunk]
    """

    # todo: eventually, return two lists
    return(l_colloc_bulk_lists)
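# Hedged usage sketch (assumes an elasticsearch-py client bound to `es`; the index, type, and
# corpus names are illustrative). Each sublist returned by create_json_chunks_file alternates a
# bulk action dict and a content dict, which is the layout the bulk API expects:
#
#   l_colloc_bulk_lists = create_json_chunks_file("test1_idx", "colloc", "ln-us-cs-500k",
#                                                 1997, 1997, docs_per_bulk_load=500)
#   for l_bulk in l_colloc_bulk_lists:
#       if l_bulk:
#           es.bulk(body=l_bulk)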
def gen_bulk_lists(index_name, type_name, domain, corpus, start, end, lines_per_bulk_load=100, section_filter_p=True, write_to_file_p=False, max_lines=0):
    # reading from fuse pipeline data
    # writing to local tv corpus dir
    # for years from start to end

    # we'll need the name of the pipeline step to create the directory path to
    # the phr_feats files.
    pipeline_step = "d3_phr_feats"

    ###print "corpus_root: %s, corpus: %s" % (corpus_root, str(corpus))

    # range parameters
    start_year = int(start)
    end_year = int(end)
    start_range = start_year
    end_range = end_year + 1

    # track the time in <year>.log
    log_file = pnames.tv_dir_year_file(corpus_root, corpus, "all", "log")
    s_log = open(log_file, "w")

    log_message = "[es_np.py gen_bulk_lists]Starting make_bulk_lists for years: " + str(start) + " " + str(end)
    time = log.log_current_time(s_log, log_message, True)
    # remember the start_time for computing total time
    start_time = time

    # we'll bulk load all the data for a single year.
    # the argument to elasticsearch bulk is a list of dictionaries
    # alternating metadata and content.  We'll build this up in l_bulk_elements.
    # The output is a list of flattened paired elements, where each list contains the
    # meta/content elements for n lines
    #l_bulk_lists = []
    l_bulk_elements = []

    for year in range(start_range, end_range):

        # loop through files in file_list_file for the year
        filelist_file = pnames.fuse_filelist(fuse_corpus_root, corpus, year)
        s_file_list = open(filelist_file)

        # track the number of lines output to json file
        num_lines_output = 0
        json_file = pnames.tv_dir(corpus_root, corpus) + str(year) + ".chunks.json"
        s_json = codecs.open(json_file, "w", encoding='utf-8')

        file_count = 0
        ###pdb.set_trace()

        for line in s_file_list:
            ###pdb.set_trace()
            file_count += 1

            line = line.strip("\n")
            # get the date/filename portion of path
            l_line_fields = line.split("\t")
            # get the rest of the file path (publication_year/id.xml)
            pub_year_and_file = l_line_fields[2]
            # extract patent_id from the filename (e.g. US5787464A from 1998/020/US5787464A.xml)
            patent_id = os.path.splitext(os.path.basename(pub_year_and_file))[0]

            # create a "doc" type entry to be bulk loaded.  This will be the parent of both
            # "sent" and "np" records in the index
            l_bulk_elements.append(format_d_action(index_name, "doc", patent_id))
            l_bulk_elements.append(format_doc_d_content(domain, year, patent_id))

            # lists to capture each sent's sheads and sterms
            sheads = []
            sterms = []
            # loc is the sentence number in the document, starting at 0
            current_sent = 0
            # Assume the initial section will be TITLE
            current_section = "TITLE"

            num_lines_output += 1
            # end creating doc index entry

            phr_feats_file = pnames.fuse_phr_feats_file(fuse_corpus_root, corpus, pipeline_step, year, pub_year_and_file)

            #print "[invention]opening phr_feats: %s, id: %s" % (phr_feats_file, patent_id)
            #sys.exit()
            #s_phr_feats = codecs.open(phr_feats_file, encoding='utf-8')
            # handle compressed or uncompressed files
            s_phr_feats = open_input_file(phr_feats_file)

            for line in s_phr_feats:

                # if we have reached the line limit for a single bulk api call, yield the
                # current sublist and start a new one
                if (num_lines_output % lines_per_bulk_load) == 0:
                    ###print "num_lines_output: %i" % num_lines_output
                    # mod will be 0 for initial time through loop, so ignore this sublist
                    if l_bulk_elements != []:
                        yield l_bulk_elements
                        l_bulk_elements = []

                # todo make into regex ///
                # Note that DESC was added 3/38/15, so indices created earlier do not contain that section.
                if not(section_filter_p) or line.find("TITLE") > 0 or line.find("ABSTRACT") > 0 or line.find("SUMMARY") > 0 or line.find("DESC") > 0:
                    # then process the line
                    l_data = line.split("\t")
                    # chunk is phrase with blanks connecting tokens
                    uid = l_data[0]     # uid is doc_id + phrase number
                    phr = l_data[2]     # phrase with whitespace separating words

                    # extract the value field from the doc_loc feature to get the loc (sentence number)
                    loc = p_doc_loc.search(line).group(1)
                    # We will store it as an integer in es
                    loc = int(loc)

                    section = p_section.search(line).group(1)
                    pos = p_pos.search(line).group(1)
                    pos = pos.replace("_", " ")

                    # populate chunk dictionaries
                    prev_V = p_prev_V.search(line)
                    if prev_V != None:
                        # extract the matched string (group 0 is the entire match, while
                        # group 1 is the first parenthesized subexpression in the pattern)
                        prev_V = prev_V.group(1)

                    prev_Npr = p_prev_Npr.search(line)
                    if prev_Npr != None:
                        prev_Npr = prev_Npr.group(1)

                    prev_J = p_prev_J.search(line)
                    if prev_J != None:
                        # extract the matched string (group 0 is the entire match, while
                        # group 1 is the first parenthesized subexpression in the pattern)
                        prev_J = prev_J.group(1)

                    ###pdb.set_trace()
                    l_bulk_elements.append(format_d_action(index_name, "np", uid, parent_id=patent_id))
                    d_field_content = format_np_d_content(phr, prev_Npr, prev_V, prev_J, domain, year, patent_id, loc, section, pos)
                    l_bulk_elements.append(d_field_content)

                    # We will use data in d_field_content to avoid recomputing fields for sent.
                    shead = d_field_content["chead"]
                    sterm = d_field_content["cterm"]
                    # section can change whenever loc changes
                    section = d_field_content["section"]

                    # if loc != current_sent, we need to store a sent record for the current_sent
                    if loc != current_sent:
                        # store the record and start populating a new one
                        sent_id = patent_id + "_" + str(current_sent)
                        l_bulk_elements.append(format_d_action(index_name, "sent", sent_id, parent_id=patent_id))
                        l_sent_dict = format_sent_d_content(domain, year, patent_id, current_section, current_sent, sheads, sterms)
                        l_bulk_elements.append(l_sent_dict)
                        ###print "Adding sent: %s, sent_dict: %s" % (sent_id, l_sent_dict)

                        # re-initialize the sheads and sterms lists
                        sheads = [shead]
                        sterms = [sterm]
                        # increment count for "sent" output
                        num_lines_output += 1
                        # update the current_sent and section
                        current_sent = loc
                        current_section = section
                    else:
                        # we are still in the same sentence.
                        # add the latest term/head to the sent fields for current_sent
                        sheads.append(shead)
                        sterms.append(sterm)

                    # increment count for "np" output
                    num_lines_output += 1

                    # stop after max_lines lines for debugging
                    ###print "num_lines_output: %i, max_lines: %i" % (num_lines_output, max_lines)
                    if (max_lines != 0) and num_lines_output > max_lines:
                        break

            # break out of file loop as well
            if (max_lines != 0) and num_lines_output > max_lines:
                break

            # We need to store a sent record for the last sentence in the last file (= current_sent)
            sent_id = patent_id + "_" + str(current_sent)
            ###print "[gen_bulk_list]last sent_id: %s, sheads: %s, sterms: %s\n" % (sent_id, sheads, sterms)
            l_bulk_elements.append(format_d_action(index_name, "sent", sent_id, parent_id=patent_id))
            l_bulk_elements.append(format_sent_d_content(domain, year, patent_id, current_section, current_sent, sheads, sterms))
            num_lines_output += 1

            s_phr_feats.close()

        s_json.close()

    log_message = "[es_np_index.py]Completed make_bulk_lists for years: " + str(start) + " " + str(end) + ".  Number of lines: " + str(num_lines_output)
    time = log.log_current_time(s_log, log_message, True)

    s_log.close()
    s_file_list.close()

    # yield the last remaining l_bulk_elements
    print "[gen_bulk_lists]%i lines from %i files written to index %s" % (num_lines_output, file_count, index_name)
    yield(l_bulk_elements)
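# Hedged usage sketch (names illustrative; assumes an elasticsearch-py client bound to `es`).
# gen_bulk_lists is a generator, so batches can be indexed as they are produced rather than
# holding a whole year of bulk elements in memory:
#
#   for l_bulk in gen_bulk_lists("np_idx", "np", "cs", "ln-us-cs-500k", 1997, 1997,
#                                lines_per_bulk_load=1000, max_lines=5000):
#       if l_bulk:
#           es.bulk(body=l_bulk)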