Example #1
    def __init__(self,
                 corpus,
                 term_file,
                 feat_file,
                 num_feats,
                 start_year=1997,
                 end_year=2007):
        # open input files
        term_path = pnames.tv_dir(corpus_root, corpus) + "/" + term_file
        feat_path = pnames.tv_dir(corpus_root, corpus) + "/" + feat_file
        s_term = codecs.open(term_path, encoding='utf-8')
        s_feat = codecs.open(feat_path, encoding='utf-8')

        # terms of interest (term => True)
        self.d_term = {}
        # feature <=> rank mappings, ranked by term cooccurrence frequency
        self.d_feat2rank = {}
        self.d_rank2feat = {}
        # (term, year, feature) => frequency
        self.d_term_year_feat2freq = defaultdict(int)
        # (term, year) => number of distinct features seen with the term that year (dispersion)
        self.d_term_year2disp = defaultdict(int)
        # (term, year) => list of [feature, freq] pairs
        self.d_term_year2feats = defaultdict(list)

        # load the terms
        for line in s_term:
            line = line.strip("\n")
            fields = line.split("\t")
            self.d_term[fields[0]] = True

        # load the features in term cooccurrence frequency order
        rank = 1
        for line in s_feat:
            line = line.strip("\n")
            fields = line.split("\t")
            self.d_feat2rank[fields[0]] = rank
            self.d_rank2feat[rank] = fields[0]
            rank += 1

        # for each year, store freq of feature for the term (using .tf data)
        end_range = end_year + 1
        for year in range(start_year, end_range):
            tf_file = pnames.tv_dir_year_file(corpus_root, corpus, year, "tf")
            s_tf = codecs.open(tf_file, encoding='utf-8')
            for line in s_tf:
                line = line.strip("\n")
                fields = line.split("\t")
                term = fields[0]
                feat = fields[1]
                freq = int(fields[2])
                if self.d_term.has_key(term):
                    # increment dispersion count for each new feature appearing with a term in a year
                    self.d_term_year2disp[(term, year)] += 1
                    # todo/// add entropy here
                    if self.d_feat2rank.has_key(feat):
                        self.d_term_year_feat2freq[(term, year, feat)] = freq
                        self.d_term_year2feats[(term, year)].append([feat, freq])

            s_tf.close()

        s_term.close()
        s_feat.close()
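A minimal usage sketch for this constructor, assuming the enclosing class is exposed as, say, TermFeatureMatrix (the class name is not shown in the excerpt) and that corpus_root, pnames, and the input files are already in place; the corpus, file names, term, and feature strings below are placeholders:

# hypothetical class name and placeholder arguments
tfm = TermFeatureMatrix("ln-us-A21-computers", "1997.terms", "1997.feats", 1000)
# frequency of a (term, year, feature) combination (0 if never seen)
print tfm.d_term_year_feat2freq[("touch screen", 1998, "prev_V=use")]
# dispersion: number of distinct features seen with the term in that year
print tfm.d_term_year2disp[("touch screen", 1998)]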
Example #2
def term_to_year1(start_year, end_year, corpus_list): 
    # value is the first year in which a term appears within any corpus in the corpus_list
    term2year1 = {}
    end_range = end_year + 1

    # .tstart file will hold the terms and their start years
    year_range = str(start_year) + "_" + str(end_year)
    term_start_file = pnames.tv_dir_year_file(corpus_root, "all", year_range, "tstart")
    print "[term_to_year1] term_start_file: %s" % term_start_file

    # .neo file is the same as .tstart but filters out terms that first appear in the start year.
    # Thus it includes only neologisms appearing after the first year.
    term_neo_file = pnames.tv_dir_year_file(corpus_root, "all", year_range, "neo")
    print "[term_to_year1] term_neo_file: %s" % term_neo_file

    for corpus in corpus_list:
        for year in range(start_year, end_range):
            term_file = pnames.tv_dir_year_file(corpus_root, corpus, str(year), "terms")
            print "[term_to_year1] processing term_file: %s" % term_file
            s_term_file = codecs.open(term_file, encoding='utf-8')
            for term_line in s_term_file:
                term_line = term_line.strip("\n")
                term_fields = term_line.split("\t")
                term = term_fields[0]
                # if the term is not in our table, enter it along with the current year as start year
                if not term2year1.has_key(term):
                    term2year1[term] = year

            s_term_file.close()

    # write the terms and start years into the .tstart and .neo files
    s_term_start_file = codecs.open(term_start_file, "w", encoding='utf-8')
    s_term_neo_file = codecs.open(term_neo_file, "w", encoding='utf-8')
    for term in term2year1.keys():
        first_year = term2year1[term]
        s_term_start_file.write("%s\t%i\n" % (term, first_year))
        if first_year != start_year:
            # then include term as a neologism
            s_term_neo_file.write("%s\t%i\n" % (term, first_year))

    s_term_start_file.close()
    s_term_neo_file.close()
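A minimal usage sketch, assuming corpus_root and pnames are configured and that per-year .terms files exist for each corpus; the corpus names below are placeholders:

# placeholder corpus names
term_to_year1(1997, 2007, ["ln-us-A21-computers", "ln-us-A22-communications"])
# writes 1997_2007.tstart (all terms with their first year) and 1997_2007.neo
# (terms first appearing after 1997) under the "all" corpus tv directory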
Example #3
    def __init__(self, corpus, year):
        year = str(year)
        # term file is used for extracting the doc frequency of terms for the year
        #term_file = corpus_root + "/" + corpus + "/data/tv/" + year + ".tf.f"
        #term_file = pnames.tv_dir_year_file(corpus_root, corpus, year, "tf.f")
        #term_file = pnames.tv_dir_year_file(corpus_root, corpus, year, "terms")
        term_file = pnames.tv_dir_year_file(corpus_root, corpus, year, "terms.2")

        self.d_term2heads = defaultdict(list)
        self.d_term2mods =  defaultdict(list)
        self.d_head2terms = defaultdict(list)
        self.d_mod2terms =  defaultdict(list)
        self.d_head2count = defaultdict(int)
        self.d_head2count_2 = defaultdict(int)
        self.d_mod2count = defaultdict(int)
        self.d_mod2count_2 = defaultdict(int)
        self.term_count = 0
        self.headed_term_count = 0
        self.headed_term_count_2 = 0
        self.modified_term_count = 0
        self.modified_term_count_2 = 0
        self.d_term2freq = defaultdict(int)
        self.l_singletons = []
        self.l_head_counts = []
        self.l_mod_counts = []

        # sum of the frequencies for all terms containing the mod or head
        # use this to capture the average spread
        self.d_mod2sum_freq = defaultdict(int)
        self.d_head2sum_freq = defaultdict(int)
        self.d_mod2average_spread = defaultdict(int)
        self.d_head2average_spread = defaultdict(int)

        # list sorted by freq [[term, freq],...]
        self.l_tf = []

        # open the file and import list of terms
        s_term_file = codecs.open(term_file, encoding='utf-8')
        for term_line in s_term_file:
            term_line = term_line.strip("\n")
            term_fields = term_line.split("\t")
            term = term_fields[0]
            # freq is the number of docs the term occurred in (this year)
            freq = int(term_fields[1])
            self.d_term2freq[term] = freq
            self.term_count += 1
            self.l_tf.append([term, freq])
        s_term_file.close()

        # sort the term list by doc frequency
        self.l_tf.sort(utils.list_element_2_sort)

        self.compute_heads_mods()
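A minimal usage sketch, assuming the enclosing class is exposed as, say, TermHeads (hypothetical name; only __init__ is shown here) and that the <year>.terms.2 file exists for the corpus:

# hypothetical class name and placeholder corpus
th = TermHeads("ln-us-A21-computers", 2003)
print th.term_count
# [term, doc_freq] pairs, sorted via utils.list_element_2_sort
print th.l_tf[:5]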
Example #4
    def filter(self, cohort_year, ref_year, target_year, ref_min, ref_max, target_min, target_max, filter_type):
        l_matches = []
        file_qualifier = "cohort." + filter_type
        cohort_file = pnames.tv_dir_year_file(corpus_root, self.corpus, cohort_year, file_qualifier)
        s_cohort_file = codecs.open(cohort_file, "w", encoding='utf-8')
        # write parameters of the cohort as first line in file
        s_cohort_file.write("#%i\t%i\t%i\t%i\t%i\t%i\t%i\n" % (cohort_year, ref_year, target_year, ref_min, ref_max, target_min, target_max))
        for term in self.d_y2l_cohort[cohort_year]:
            rf = self.d_ty2freq[(term, ref_year)]
            tf = self.d_ty2freq[(term, target_year)]
            if ref_min <= rf <= ref_max and target_min <= tf <= target_max:
                l_matches.append([term, rf, tf])
                # save to a file as well
                s_cohort_file.write("%s\t%i\t%i\n" % (term, rf, tf))
        s_cohort_file.close()
        return(l_matches)
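A minimal usage sketch, assuming filter is a method of the RFreq-style class whose __init__ appears in Example #6 (it relies on self.corpus, self.d_y2l_cohort, and self.d_ty2freq built there), where rf is an instance of that class; the thresholds below are illustrative only:

# terms that first appeared in 1998, had reference-year frequency 1-2,
# and reached frequency 25-10000 by the 2005 target year
l_growth = rf.filter(1998, 1998, 2005, 1, 2, 25, 10000, "growth")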
Example #5
def cohort_features(corpus, year, l_cohort, cohort_name):
    # cohort_term_feature => total freq
    # This accumulates the count of occurrences of a cohort term
    # with a feature in the given year
    d_cf2freq = defaultdict(int)
    # any_term_features => total freq
    # This accumulates the count of occurrences of any term
    # with a feature in the given year
    d_tf2freq = defaultdict(int)
    sum_cohort_feature_occurrences = 0
    sum_term_feature_occurrences = 0

    # keep a dict of all features encountered with cohort terms in the year
    d_feats = defaultdict(bool)

    # score consisting of prob(feature|cohort term) / prob(feature | term)
    d_cf_score = {}

    # cohort terms in dict form
    d_cohort = {}

    # output file for scores
    qualifier = cohort_name + ".fscores"
    score_file = pnames.tv_dir_year_file(corpus_root, corpus, year, qualifier)
    s_score_file = codecs.open(score_file, "w", encoding='utf-8')

    # store cohort list terms in a dict
    for (term, rf, tf) in l_cohort:
        d_cohort[term] = True

    year = str(year)
    tf_file = pnames.tv_dir_year_file(corpus_root, corpus, year, "tf")
    s_tf_file = codecs.open(tf_file, encoding='utf-8')
    print "[RFreq]loading terms for year: %s" % year
    for term_line in s_tf_file:
        term_line = term_line.strip("\n")
        (term, feat, freq, prob) = term_line.split("\t")
        freq = int(freq)
        if d_cohort.has_key(term):
            # update cohort term counts
            d_cf2freq[feat] += freq
            sum_cohort_feature_occurrences += freq
            # keep track of the cohort features seen
            d_feats[feat] = True

        # update all feature counts
        sum_term_feature_occurrences += freq
        d_tf2freq[feat] += freq
        
    sum_cohort_feature_occurrences = float(sum_cohort_feature_occurrences)
    sum_term_feature_occurrences = float(sum_term_feature_occurrences)
    for feat in d_feats.keys():
        prob_fgc = d_cf2freq[feat] / sum_cohort_feature_occurrences
        prob_fgt = d_tf2freq[feat] / sum_term_feature_occurrences
        if prob_fgt == 0:
            pdb.set_trace()
        d_cf_score[feat] = prob_fgc / prob_fgt

    l_scores_sorted = d_cf_score.items()
    l_scores_sorted.sort(key=itemgetter(1), reverse=True)
    for (feat, score) in l_scores_sorted:
        s_score_file.write("%.2f\t%s\t%i\t%i\n" % (score, feat, d_cf2freq[feat], d_tf2freq[feat]))
    s_tf_file.close()
    s_score_file.close()
    print "[fan.cohort_features]Wrote scores to %s" % score_file
Example #6
    def __init__(self, corpus, start_year, end_year, term_subset_file=""):
        root = corpus_root
        # frequency for a term-year combination
        self.d_ty2freq = defaultdict(int)
        # number of terms in this year
        self.d_y2tcount = defaultdict(int)
        # number of new terms in this year
        self.d_y2ncount = defaultdict(int)
        # has the term been seen in any year so far
        self.d_term2seen = defaultdict(bool)
        # is the term new in this range (i.e., did it first appear after the start year)
        self.d_term2new = defaultdict(bool)
        # appearance year for term
        self.d_term2y1 = defaultdict(int)
        # all new terms in a year
        self.d_y2l_cohort = defaultdict(list)
        # list of freq for the term starting with first appearance year
        self.d_term2l_history = defaultdict(list)
        self.corpus = corpus
        self.term_subset_p = False
        if term_subset_file != "":
            self.term_subset_p = True
        
        self.d_term_subset = {}
        # If term_subset_file is not "", populate a dictionary of the subset of terms and
        # only use terms in this dictionary in cohorts.
        if self.term_subset_p:
            term_subset_path = pnames.tv_dir(corpus_root, corpus) + "/" + term_subset_file
            s_term_subset = codecs.open(term_subset_path, encoding='utf-8')
            for term_line in s_term_subset:
                term_line = term_line.strip("\n")
                term_fields = term_line.split("\t")
                term = term_fields[0]
                self.d_term_subset[term] = True
            s_term_subset.close()
            print "[fan.py Rfreq]Using term subset with %i terms" % len(self.d_term_subset.keys())

        for year in range(start_year, end_year + 1):
            #term_file = pnames.tv_dir_year_file(corpus_root, corpus, year, "terms.2")
            term_file = pnames.tv_dir_year_file(corpus_root, corpus, year, "terms")
            s_term_file = codecs.open(term_file, encoding='utf-8')
            print "[RFreq]loading terms for year: %i" % year
            for term_line in s_term_file:
                term_line = term_line.strip("\n")
                term_fields = term_line.split("\t")
                term = term_fields[0]
                if self.term_subset_p == False or self.d_term_subset.has_key(term):
                    #pdb.set_trace()
                    # freq is the number of docs the term occurred in (this year)
                    freq = int(term_fields[1])

                    ty = tuple([term, year])

                    # save the freq for the year
                    self.d_ty2freq[ty] = freq
                    self.d_y2tcount[year] += 1

                    # record the first appearance year (y1) for the term
                    if not self.d_term2seen[term]:
                        # if the term does not appear in the start year, we will call it
                        # new in this range
                        if year != start_year:
                            self.d_term2new[term] = True
                            self.d_y2ncount[year] += 1
                        self.d_term2y1[term] = year
                        self.d_y2l_cohort[year].append(term)
                        # mark term as seen
                        self.d_term2seen[term] = True
                
            print "Loaded %i terms, %i new" % (self.d_y2tcount[year], self.d_y2ncount[year]) 
            s_term_file.close()
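A minimal usage sketch, assuming the enclosing class is fan.RFreq (the print tags suggest this, though only __init__ is shown) and a placeholder corpus name:

rf = RFreq("ln-us-A21-computers", 1997, 2007)
# terms that first appeared in 2003 (the 2003 cohort)
print len(rf.d_y2l_cohort[2003])
# document frequency of a term in a given year (0 if absent)
print rf.d_ty2freq[("touch screen", 2003)]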
Example #7
def create_json_chunks_file(index_name, type_name, corpus, start, end, docs_per_bulk_load=500, section_filter_p=True, write_to_file_p=False):
    # reading from fuse pipeline data
    # writing to local tv corpus dir
    # for years from start to end

    # we'll need the name of the pipeline step to create the directory path to 
    # the phr_feats files.
    pipeline_step = "d3_phr_feats"

    # range parameters
    start_year = int(start)
    end_year = int(end)
    start_range = start_year
    end_range = end_year + 1

    # track the time in <year>.log
    log_file = pnames.tv_dir_year_file(corpus_root, corpus, "all", "log")
    s_log = open(log_file, "w")

    log_message = "Starting create_json_chunks_file for years: " + str(start) + " " + str(end)
    time = log.log_current_time(s_log, log_message, True)
    # remember the start_time for computing total time
    start_time = time


    # we'll bulk load all the data for a single year.
    # the argument to elasticsearch bulk is a list of dictionaries
    # alternating metadata and content.  We'll build this up in l_bulk_elements
    
    # The output is a list of lists, where each list contains the meta/content elements for n files
    l_colloc_bulk_lists = []
    l_colloc_bulk_elements = []

    d_chunk2prev_Npr = defaultdict(set)
    d_chunk2prev_V = defaultdict(set)
    d_chunk2docs = defaultdict(set)

    for year in range(start_range, end_range):

        # loop through files in file_list_file for the year
        filelist_file = pnames.fuse_filelist(fuse_corpus_root, corpus, year)
        s_file_list = open(filelist_file)

        # track the number of lines output to json file
        num_lines_output = 0
        json_file = pnames.tv_dir(corpus_root, corpus) + "/" + str(year) + ".chunks.json"
        s_json = codecs.open(json_file, "w", encoding='utf-8')

        file_count = 0
        for line in s_file_list:

            # if we have reached the file limit for a single bulk api call, add the sublist to l_colloc_bulk_lists 
            # and start a new sublist
            if (file_count % docs_per_bulk_load) == 0:
                # mod will be 0 for initial time through loop, so ignore this sublist
                if l_colloc_bulk_elements != []:
                    l_colloc_bulk_lists.append(l_colloc_bulk_elements)
                    l_colloc_bulk_elements = []

            file_count += 1
            
            line = line.strip("\n")
            # get the date/filename portion of path
            l_line_fields = line.split("\t")
            # get the rest of the file path (publication_year/id.xml)
            pub_year_and_file = l_line_fields[2]
            # extract patent_id from the filename (e.g. US5787464A from 1998/020/US5787464A.xml)
            patent_id = os.path.splitext(os.path.basename(pub_year_and_file))[0]
            phr_feats_file = pnames.fuse_phr_feats_file(fuse_corpus_root, corpus, pipeline_step, year, pub_year_and_file)

            #print "[invention]opening phr_feats: %s, id: %s" % (phr_feats_file, patent_id)
            #sys.exit()

            #s_phr_feats = codecs.open(phr_feats_file, encoding='utf-8')
            # handle compressed or uncompressed files
            s_phr_feats = open_input_file(phr_feats_file)

            # we need to combine all the chunks from a single sentence into one output entry
            l_chunks = []
            # assume the first sent_no in a document will always be 0
            last_sent_no = "0"
            for line in s_phr_feats:
                # todo make into regex ///
                if not(section_filter_p) or line.find("TITLE") > 0 or line.find("ABSTRACT") > 0 or line.find("SUMMARY") > 0:
                    # then process the line
                    l_data = line.split("\t")
                    # save chunk as phrase with "_" instead of blank connecting tokens
                    chunk = l_data[2].replace(" ", "_")
                    # extract the value field from the doc_loc feature to get the sent_no
                    sent_no = p_doc_loc.search(line).group(1)

                    # populate chunk dictionaries with the matched strings
                    d_chunk2docs[chunk].add(patent_id)
                    prev_V = p_prev_V.search(line)
                    if prev_V != None:
                        d_chunk2prev_V[chunk].add(prev_V.group(1))
                    prev_Npr = p_prev_Npr.search(line)
                    if prev_Npr != None:
                        d_chunk2prev_Npr[chunk].add(prev_Npr.group(1))

                    if sent_no == last_sent_no:
                        l_chunks.append(chunk)
                    else:
                        # we are done with the sentence, so write out the chunk list
                        json_string = format_colloc_chunks2json(patent_id, year, last_sent_no, l_chunks)
                        uid = "_".join([patent_id, last_sent_no])
                        
                        #print "last_sent_no: %s, chunks: %s, json: %s" % (last_sent_no, l_chunks, json_string)
                        # note the above print gives an error for non-asci chars.
                        if write_to_file_p:
                            # make a json file with all the data to be loaded into elasticsearch
                            s_json.write("%s\n" % json_string)
                        l_colloc_bulk_elements.append(format_d_action(index_name, type_name, uid))
                        l_colloc_bulk_elements.append(format_colloc_d_content(patent_id, year, last_sent_no, l_chunks))

                        # keep the current chunk
                        l_chunks = [chunk]
                        last_sent_no = sent_no
                        num_lines_output += 1

            # output the data for the last sentence in the file
            json_string = format_colloc_chunks2json(patent_id, year, last_sent_no, l_chunks)
            uid = "_".join([patent_id, last_sent_no])
            #print "last_sent_no: %s, chunks: %s, json: %s" % (last_sent_no, l_chunks, json_string)
            s_json.write("%s\n" % json_string)
            l_colloc_bulk_elements.append(format_d_action(index_name, type_name, uid))
            l_colloc_bulk_elements.append(format_colloc_d_content(patent_id, year, last_sent_no, l_chunks))
            num_lines_output += 1

            #"""
            # stop after n files for debugging
            if file_count > 3000:
                break
            #"""

            s_phr_feats.close()            

        # add the remaining elements to l_colloc_bulk_lists
        l_colloc_bulk_lists.append(l_colloc_bulk_elements)

        print "[docs.py]%i lines from %i files written to %s" % (num_lines_output, file_count, json_file)
        s_json.close()
    s_log.close()
    s_file_list.close()

    """
    # unfinished section to create chunk index
    # prepare data for chunk index
    for chunk in d_chunk2docs.keys():

        l_docs = d_chunk2docs[chunk]
        l_prev_V = d_chunk2prev_V[chunk]
        l_prev_Npr = d_chunk2prev_Npr[chunk]

    """


    # todo: eventually, return two lists
    return(l_colloc_bulk_lists)
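A minimal sketch of how the returned bulk lists might be pushed to Elasticsearch with the low-level client's bulk() call; the index, type, and corpus names are placeholders, and client setup plus index creation are assumed to happen elsewhere:

from elasticsearch import Elasticsearch

es = Elasticsearch()   # assumes a local node; adjust hosts as needed
l_colloc_bulk_lists = create_json_chunks_file("fuse_colloc", "colloc", "ln-us-A21-computers", 1997, 1998)
for l_elements in l_colloc_bulk_lists:
    # each sublist alternates action metadata and document content dicts
    if l_elements != []:
        es.bulk(body=l_elements)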
Example #8
def gen_bulk_lists(index_name, type_name, domain, corpus, start, end, lines_per_bulk_load=100, section_filter_p=True, write_to_file_p=False, max_lines=0):
    # reading from fuse pipeline data
    # writing to local tv corpus dir
    # for years from start to end

    # we'll need the name of the pipeline step to create the directory path to 
    # the phr_feats files.
    pipeline_step = "d3_phr_feats"

    ###print "corpus_root: %s, corpus: %s"  % (corpus_root, str(corpus))

    # range parameters
    start_year = int(start)
    end_year = int(end)
    start_range = start_year
    end_range = end_year + 1

    # track the time in <year>.log
    log_file = pnames.tv_dir_year_file(corpus_root, corpus, "all", "log")
    s_log = open(log_file, "w")

    log_message = "[es_np.py gen_bulk_lists]Starting make_bulk_lists for years: " + str(start) + " " + str(end)
    time = log.log_current_time(s_log, log_message, True)
    # remember the start_time for computing total time
    start_time = time

    # we'll bulk load all the data for a single year.
    # the argument to elasticsearch bulk is a list of dictionaries
    # alternating metadata and content.  We'll build this up in l_bulk_elements
    
    # The generator yields flat lists of alternating meta/content elements,
    # each list covering roughly lines_per_bulk_load lines
    #l_bulk_lists = []
    l_bulk_elements = []

    for year in range(start_range, end_range):

        # loop through files in file_list_file for the year
        filelist_file = pnames.fuse_filelist(fuse_corpus_root, corpus, year)
        s_file_list = open(filelist_file)

        # track the number of lines output to json file
        num_lines_output = 0
        json_file = pnames.tv_dir(corpus_root, corpus) + "/" + str(year) + ".chunks.json"
        s_json = codecs.open(json_file, "w", encoding='utf-8')

        file_count = 0
        ###pdb.set_trace()

        for line in s_file_list:
            ###pdb.set_trace()


            file_count += 1
            
            line = line.strip("\n")
            # get the date/filename portion of path
            l_line_fields = line.split("\t")
            # get the rest of the file path (publication_year/id.xml)
            pub_year_and_file = l_line_fields[2]
            # extract patent_id from the filename (e.g. US5787464A from 1998/020/US5787464A.xml)
            patent_id = os.path.splitext(os.path.basename(pub_year_and_file))[0]

            # create a "doc" type entry to be bulk loaded.  This will be the parent of both "sent"
            # and "np" records in the index
            
            l_bulk_elements.append(format_d_action(index_name, "doc", patent_id))
            l_bulk_elements.append(format_doc_d_content(domain, year, patent_id))

            # lists to capture each sent's sheads and sterms
            sheads = []
            sterms = []
            # loc is the sentence number in the document, starting at 0
            current_sent = 0
            # Assume the initial section will be TITLE
            current_section = "TITLE"

            num_lines_output += 1

            # end creating doc index entry

            phr_feats_file = pnames.fuse_phr_feats_file(fuse_corpus_root, corpus, pipeline_step, year, pub_year_and_file)

            #print "[invention]opening phr_feats: %s, id: %s" % (phr_feats_file, patent_id)
            #sys.exit()

            #s_phr_feats = codecs.open(phr_feats_file, encoding='utf-8')
            # handle compressed or uncompressed files
            s_phr_feats = open_input_file(phr_feats_file)

            for line in s_phr_feats:

                # if we have reached the line limit for a single bulk api call, add the sublist to l_bulk_lists 
                # and start a new sublist
                if (num_lines_output % lines_per_bulk_load) == 0:
                    ###print "num_lines_output: %i" % num_lines_output
                    # mod will be 0 for initial time through loop, so ignore this sublist
                    if l_bulk_elements != []:
                        yield l_bulk_elements
                        l_bulk_elements = []

                # todo make into regex ///
                # Note that DESC was added 3/38/15, so indices created earlier do not contain that section.
                if not(section_filter_p) or line.find("TITLE") > 0 or line.find("ABSTRACT") > 0 or line.find("SUMMARY") > 0 or line.find("DESC") > 0:
                    # then process the line
                    l_data = line.split("\t")
                    # the chunk is the phrase with blanks connecting tokens
                    uid = l_data[0]  # uid is doc_id + phrase number
                    phr = l_data[2]  # phrase with whitespace separating words

                    # extract the value field from the doc_loc feature to get the loc (sentence number)
                    loc = p_doc_loc.search(line).group(1)
                    # We will store it as an integer in es
                    loc = int(loc)

                    section = p_section.search(line).group(1)
                    pos = p_pos.search(line).group(1)
                    pos = pos.replace("_", " ")

                    # populate chunk dictionaries
                    prev_V = p_prev_V.search(line)
                    if prev_V != None:
                        # extract the matched string (group 0 is the entire match, while 
                        # group 1 is the first parenthesized subexpression in the pattern)
                        prev_V = prev_V.group(1)

                    prev_Npr = p_prev_Npr.search(line)
                    if prev_Npr != None:
                        prev_Npr = prev_Npr.group(1)

                    prev_J = p_prev_J.search(line)
                    if prev_J != None:
                        # extract the matched string (group 0 is the entire match, while 
                        # group 1 is the first parenthesized subexpression in the pattern)
                        prev_J = prev_J.group(1)


                    ###pdb.set_trace()
                    l_bulk_elements.append(format_d_action(index_name, "np", uid, parent_id=patent_id))
                    d_field_content = format_np_d_content(phr, prev_Npr, prev_V, prev_J, domain, year, patent_id, loc, section, pos)
                    l_bulk_elements.append(d_field_content)

                    # We will use data in d_field_content to avoid recomputing fields for sent.
                    shead = d_field_content["chead"]
                    sterm = d_field_content["cterm"]
                    # section can change whenever loc changes
                    section = d_field_content["section"]

                    # if loc != current_sent, we need to store a sent record for the current_loc
                    if loc != current_sent:
                        # store the record and start populating a new one
                        sent_id = patent_id + "_" + str(current_sent)
                        l_bulk_elements.append(format_d_action(index_name, "sent", sent_id, parent_id=patent_id))
                        l_sent_dict = format_sent_d_content(domain, year, patent_id, current_section, current_sent, sheads, sterms)
                        l_bulk_elements.append(l_sent_dict)

                        ###print "Adding sent: %s, sent_dict: %s" % (sent_id, l_sent_dict)
                        # re-initialize the sheads and sterms lists
                        sheads = [ shead ]
                        sterms = [ sterm ]
                        # increment count for "sent" output
                        num_lines_output += 1
                        # update the current_sent and section
                        current_sent = loc
                        current_section = section

                    else:
                        # we are still in the same sentence.
                        # add the latest term/head to the sent fields for current_sent
                        sheads.append(shead)
                        sterms.append(sterm)

                    # increment count for "np" output
                    num_lines_output += 1
       
                # stop after max_lines files for debugging
                ###print "num_lines_output: %i, max_lines: %i" % (num_lines_output, max_lines)
                if (max_lines != 0) and num_lines_output > max_lines: 
                    break
            # break out of file loop as well
            if (max_lines != 0) and num_lines_output > max_lines: 
                break

            # We need to store a sent record for the last sentence in the file (= current_sent)
            sent_id = patent_id + "_" + str(current_sent)
            ###print "[gen_bulk_list]last sent_id: %s, sheads: %s, sterms: %s\n" % (sent_id, sheads, sterms)
            l_bulk_elements.append(format_d_action(index_name, "sent", sent_id, parent_id=patent_id))
            l_bulk_elements.append(format_sent_d_content(domain, year, patent_id, current_section, current_sent, sheads, sterms))
            num_lines_output += 1

            s_phr_feats.close()            

        s_json.close()
        s_file_list.close()

    log_message = "[es_np_index.py]Completed make_bulk_lists for years: " + str(start) + " " + str(end) + ". Number of lines: " + str(num_lines_output)
    time = log.log_current_time(s_log, log_message, True)

    s_log.close()

    # yield the last remaining l_bulk_elements

    print "[gen_bulk_lists]%i lines from %i files written to index %s" % (num_lines_output, file_count, index_name)
    yield(l_bulk_elements)
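A minimal sketch of how this generator might be consumed, issuing one bulk request per yielded list; the index, domain, and corpus names are placeholders:

from elasticsearch import Elasticsearch

es = Elasticsearch()   # assumes a local node; adjust hosts as needed
for l_elements in gen_bulk_lists("fuse_np", "np", "computers", "ln-us-A21-computers",
                                 1997, 1998, lines_per_bulk_load=1000):
    # each yielded list alternates action metadata and document content dicts
    if l_elements != []:
        es.bulk(body=l_elements)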