Example 1
 def add_dataset(self, dataset):
     self._check_dataset(dataset)
     self.dataset = dataset
     logfile = "index.log.dataset.%s.txt" % self.dataset
     self.log = open(os.path.join(self.idx_dir, logfile), 'w')
     self.classify_dir = os.path.join(corpus, 'data', 't2_classify',
                                      dataset)
     fname = os.path.join(self.classify_dir, 'classify.MaxEnt.out.s2.y.nr')
     fh = open_input_file(fname)
     years = {}
     terms = {}
     self.log.write("$ python %s\n\n" % ' '.join(sys.argv))
     self._write_message("Collecting terms...")
     count = 0
     t1 = time.time()
     step = 100000
     for line in fh:
         count += 1
         #if count > 100000: break
         if count % step == 0:
             t2 = time.time()
             self._write_message(
                 "   loaded %s classifier lines in %.2f seconds (%sK done)"
                 % (step, t2 - t1, count / 1000))
             t1 = t2
         (id, score) = line.rstrip().split("\t")
         (year, doc, term) = id.split("|", 2)
         score = float(score)
         self._update_years_idx(year, doc, years)
         self._update_terms_idx(term, year, score, terms)
     self._write_message("Updating databases...")
     self._update_years_db(years)
     self._update_terms_db(terms)
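Every example on this page uses a helper named open_input_file whose definition is not shown here. Judging from the recurring comment "handle compressed or uncompressed files", it presumably opens either a gzipped or a plain-text file and returns a readable handle; the following is only a minimal sketch under that assumption, not the project's actual implementation.

import codecs
import gzip

def open_input_file(fname):
    # assumed behavior: transparently open gzipped or plain files and
    # return a handle that yields unicode lines
    if fname.endswith('.gz'):
        return codecs.getreader('utf-8')(gzip.open(fname, 'rb'))
    return codecs.open(fname, encoding='utf-8')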
Example 2
def add_phr_feats_file(phr_feats_file, s_mallet):
    """Loop through phr_feats_file and add the first 30 lines to s_mallet. Only
    add the lines if the chunk is in the title or abstract."""
    # TODO: was originally imported from run_iclassify, belongs in mallet?
    # TODO: should maybe be in separate utilities file (classifier_utils.py)
    global output_count
    num_lines_output = 0
    # handle compressed or uncompressed files
    s_phr_feats = open_input_file(phr_feats_file)
    # keep first 30 chunks, if they are from title/abstract
    num_chunks = 0
    for line in s_phr_feats:
        if num_chunks >= 30:
            break
        line = line.strip("\n")
        if line.find("TITLE") > 0 or line.find("ABSTRACT") > 0:
            l_data = line.split("\t")
            chunkid = l_data[0]
            year = l_data[1]
            phrase = l_data[2]
            l_feats = l_data[3:]
            key = make_instance_key(chunkid, year, phrase)
            # add dummy "n" as class label
            instance_line = key + " n " + " ".join(l_feats) + "\n"
            output_count += 1
            s_mallet.write(instance_line)
            num_chunks += 1
            num_lines_output += 1
    s_phr_feats.close()
    return num_lines_output
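For orientation: the phr_feats lines read above, and in several later examples, are tab-separated, with the chunk id, year, and phrase in the first three columns and feature=value pairs after that. A small sketch with made-up values follows; apart from doc_loc and prev_V, which do occur in these examples, the feature names and values are hypothetical.

# hypothetical phr_feats line (tab-separated)
line = "US5787464A_12\t1998\tdata processing unit\t" \
       "doc_loc=3\tsection_loc=ABSTRACT_sent3\tprev_V=comprise\n"
l_data = line.strip("\n").split("\t")
chunkid, year, phrase = l_data[0], l_data[1], l_data[2]
l_feats = l_data[3:]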
Example 3
def add_file_to_utraining_test_file(fname,
                                    s_test,
                                    d_phr2label,
                                    d_features,
                                    stats,
                                    use_all_chunks_p=True,
                                    default_label='n'):
    """Add document features from fname as vectors to s_test. This was factored
    out from make_utraining_test_file() so that it could be called by itself."""
    def incr(x):
        stats[x] += 1

    fh = open_input_file(fname)
    year, doc_id = get_year_and_docid(fname)
    docfeats = generate_doc_feats(fh, doc_id, year)
    for term in sorted(docfeats.keys()):
        feats = docfeats[term][2:]
        # use only the features used by the model
        if d_features:
            feats = [f for f in feats if d_features.has_key(f.split("=")[0])]
        uid = "%s|%s|%s" % (year, doc_id, term.replace(' ', '_'))
        feats = sorted(unique_list(feats))
        incr('labeled_count') if d_phr2label.has_key(term) else incr(
            'unlabeled_count')
        # include the instance if all chunks are used or if it doesn't have a label.
        if use_all_chunks_p == True or not d_phr2label.has_key(term):
            mallet_list = [uid, default_label] + feats
            # mallet line format: "uid label f1 f2 f3 ..."
            mallet_line = u" ".join(mallet_list) + u"\n"
            s_test.write(mallet_line)
            incr('total_count')
    fh.close()
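The Mallet instance lines written by these functions follow the format noted in the comments, "uid label f1 f2 f3 ...", with the uid assembled as year|doc_id|term. A short sketch with hypothetical values:

# uid is year|doc_id|term (spaces in the term replaced by underscores),
# followed by the class label and the feature strings
uid = "%s|%s|%s" % ("1998", "US5787464A", "data_processing_unit")
mallet_line = u" ".join([uid, "n", "doc_loc=3", "prev_V=comprise"]) + u"\n"
# -> u"1998|US5787464A|data_processing_unit n doc_loc=3 prev_V=comprise\n"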
Example 4
def read_roles(year):
    roles = {}
    fname = os.path.join(KEYTERMS_CLASS, year,
                         'iclassify.MaxEnt.label.merged.tab')
    print fname
    c = 0
    for line in open_input_file(fname):
        c += 1
        if c % 100000 == 0: print c
        #if c > 1000: break
        (id, basename, role, term) = line.rstrip("\n\r\f").split("\t")
        roles[term][role] = roles.setdefault(term, {}).get(role, 0) + 1
    return roles
Example 5
def collect_terms_in_corpus(corpus):
    t1 = time.time()
    terms = {}
    done = 0
    for line in open(os.path.join(corpus, 'config', FILELIST)):
        done += 1
        if done % 100 == 0: print done
        #if done >= 100: break
        fname = line.split()[2]
        fname = os.path.join(corpus, 'data', 'd3_phr_feats', '01', 'files',
                             fname)
        for line in open_input_file(fname):
            term = line.split("\t")[2]
            terms[term] = terms.get(term, 0) + 1
    return terms
Example 6
 def run_matcher_on_file(self, fname, fh):
     infile = open_input_file(fname)
     for line in infile:
         (id, year, term, feats) = parse_feats_line(line)
         self.feature_statistics.add(feats)
         prev_V = feats.get('prev_V', None)
         #initial_V = feats.get('initial_V', None)
         #chunk_lead_VBG = feats.get('chunk_lead_VBG', None)
         #if prev_V is not None:
         #    fh.write("%s\t%s\t%s\t%s\n" % (year, id, term , prev_V))
         for pattern in self.patterns:
             matched_features = pattern.matches(feats)
             if matched_features is not None:
                 fh.write("%s\t%s\t%s\t%s\t%s\n" %
                          (year, id, pattern.name, term, matched_features))
Example 7
def count_tokens_in_corpus(corpus):
    t1 = time.time()
    file_count = 0
    sentence_count = 0
    token_count = 0
    done = 0
    for line in open(os.path.join(corpus, 'config', 'files.txt')):
        #if done >= 100: break
        fname = line.split()[2]
        fname = os.path.join(corpus, 'data', 'd2_tag', '01', 'files', fname)
        file_count += 1
        for line in open_input_file(fname):
            sentence_count += 1
            token_count += len(line.split())
        done += 1
    print corpus, file_count, sentence_count, token_count, "(%d seconds)" \
          % (time.time() - t1)
Example 8
 def _process_file(self, fname, fh):
     self.locations = {}
     infile = open_input_file(fname)
     for l in infile:
         parsed_line = parse_feats_line(l)
         year = parsed_line[1]
         term = parsed_line[2]
         feats = parsed_line[3]
         path = year + os.sep + os.path.splitext(parsed_line[0])[0]
         line = feats.get('doc_loc', '-1')
         key = path + "\t" + term
         if not self.locations.has_key(key):
             self.locations[key] = []
         self.locations[key].append(line)
     for key, lines in self.locations.items():
         path, term = key.split("\t", 1)
         fh.write("%s\t%s\t%s\t%s\n" %
                  (path, term, len(lines), ' '.join(lines)))
Example 9
def collect_counts(dataset, filelist):
    """Return a dictionary with for each term the number of documents it
    appeared in. This assumes that the dataset is a d3_phr_feats dataset."""
    counts = {}
    fnames = filename_generator(dataset.path, filelist)
    for fname in fnames:
        if verbose:
            print '[collect_counts]', fname
        # TODO: this is dangerous because it makes assumptions about the
        # directory structure, something similar was the case in step2 for at
        # least the docfeats generation
        year = os.path.basename(os.path.dirname(fname))
        doc_id = os.path.basename(fname)
        with open_input_file(fname) as fh:
            docfeats = generate_doc_feats(fh, doc_id, year)
            for term in docfeats.keys():
                counts[term] = counts.get(term, 0) + 1
    return counts
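A brief usage sketch for the document-frequency dictionary that collect_counts returns; the terms and counts below are invented for illustration.

# suppose collect_counts(dataset, filelist) returned this dictionary
counts = {'data processing unit': 42, 'control signal': 17, 'memory': 103}
# list terms by the number of documents they appear in, most frequent first
for term, doc_count in sorted(counts.items(), key=lambda x: -x[1]):
    print "%6d  %s" % (doc_count, term)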
Example 10
def annotate_something(dirname, rconfig, filelist, chunks):
    """This is a stub method that explains a bit more on how to create
    annotation files. Includes scaffolding that shows how to pull information
    out of phrase feature and tag files. This is for cases when you use a list
    of files."""

    # Here is how you get the datasets
    dataset_tags = find_input_dataset(rconfig, 'd2_tag')
    dataset_feats = find_input_dataset(rconfig, 'd3_phr_feats')

    # Check whether files from the file list are available
    check_file_availability(dataset_tags, filelist)
    check_file_availability(dataset_feats, filelist)

    # Next would typically be some way of writing down the information; the
    # following writes general information (command used, corpus directory as
    # well as git commit) and the list of files used. It also creates the
    # output directory.
    write_info(rconfig, dirname, filelist)

    # Now we can get the file names, loop over them, and extract the needed
    # information. The code below is some scaffolding if all you need is in one
    # dataset.
    fnames = filename_generator(dataset_feats.path, filelist)
    for fname in fnames:
        with open_input_file(fname) as fh:
            # extract data from the line, you may want to put it in some
            # temporary data structure
            for line in fh:
                pass

    # And this is what you do if you need information that is distributed over
    # the feature and tag files.
    tag_files = list(filename_generator(dataset_tags.path, filelist))
    feat_files = list(filename_generator(dataset_feats.path, filelist))
    for i in range(len(tag_files)):
        # the FileData object
        fd = FileData(tag_files[i], feat_files[i])
        # all term-related stuff lives in the Term object and its term_instances
        # variable, you can print to the annotation file(s) from here or first
        # build some intermediate data structure and then print the output later
        for term in fd.get_terms():
            term_obj = fd.get_term(term)
Example 11
def _itrainer_create_dat_file(phr_feats_file, corpus, filelist):
    """Create the keyfeats.ta.dat file, which is a concatenation of all the
    files in filelist, but using only the first 100 terms in each file (because
    annotation does not go beyond those 100)."""
    print "[_itrainer_create_dat_file] creating", phr_feats_file
    print "[_itrainer_create_dat_file] from", corpus
    phr_feats_fh = codecs.open(phr_feats_file, 'w', encoding='utf-8')
    for line in open(filelist):
        (year, full_path, short_path) = line.split()
        # TODO: this is a hack, change this to use the filename generator and
        # the default_config and such
        fname = os.path.join(corpus, 'data/d3_phr_feats/01/files',
                             short_path)  # + '.gz')
        fh = open_input_file(fname)
        for line in fh:
            term_no = int(line.split()[0].split('_')[1])
            # no need to get too far into the file
            if term_no > 100: break
            phr_feats_fh.write(line)
    phr_feats_fh.close()
Example 12
    def make_utraining_file3(self, fnames, d_phr2label, verbose=False):
        """Create a file with training instances for Mallet. The list of phrase feature
        files to use is given in fnames and the annotated terms in d_phr2label.
        Also sets a couple of instance variables with statistics on labeled and
        unlabeled instances and types: stats_unlabeled_count has a count of all
        instances (that is, term-document pairs) in the files in fnames without
        labels, stats_labeled_count has the number of all labeled instances, and
        stats_terms has a dictionary of terms to number of labeled instances per
        term.

        This method is based on a similarly named function in train.py.

        """

        mallet_file = self.mallet_config.train_mallet_file
        if verbose:
            print "[mallet.make_utraining_file3] writing to", mallet_file
            print "[mallet.make_utraining_file3] features used:", \
                sorted(self.d_features.keys())

        self.stats_labeled_count = 0
        self.stats_labeled_count_y = 0
        self.stats_labeled_count_n = 0
        self.stats_unlabeled_count = 0
        self.stats_terms = {}
        self.stats_terms_y = {}
        self.stats_terms_n = {}

        file_count = 0
        s_train = codecs.open(mallet_file, 'w', encoding='utf-8')
        for phr_feats_file in fnames:
            file_count += 1
            if verbose:
                print "%05d %s" % (file_count, phr_feats_file)
            year, doc_id = get_year_and_docid(phr_feats_file)
            with open_input_file(phr_feats_file) as fh:
                # this hard-wires the use of union train
                docfeats = generate_doc_feats(fh, doc_id, year)
                for term in sorted(docfeats.keys()):
                    feats = docfeats[term][2:]
                    feats = self.remove_filtered_feats(feats)
                    uid = "%s|%s|%s" % (year, doc_id, term.replace(' ','_'))
                    if d_phr2label.has_key(term):
                        label = d_phr2label.get(term)
                        if label == "":
                            print "[mallet.make_utraining_file3] " + \
                                "WARNING: term with null label: %s" % term
                        elif label in ('y', 'n'):
                            self.stats_terms[term] = self.stats_terms.get(term, 0) + 1
                            d = self.stats_terms_y if label == 'y' else self.stats_terms_n
                            d[term] = d.get(term, 0) + 1
                            # mallet line format: "uid label f1 f2 f3 ..."
                            mallet_line = " ".join([uid, label] + feats)
                            s_train.write(mallet_line + "\n")
                            self.stats_labeled_count += 1
                    else:
                        self.stats_unlabeled_count += 1

        if verbose:
            print "[make_utraining_file3] labeled instances: %i, unlabeled: %i, labeled types: %i" \
                % (self.stats_labeled_count, self.stats_unlabeled_count, len(self.stats_terms))
Example 13
def create_json_chunks_file(index_name, type_name, corpus, start, end, docs_per_bulk_load=500, section_filter_p=True, write_to_file_p=False):
    # reading from fuse pipeline data
    # writing to local tv corpus dir
    # for years from start to end

    # we'll need the name of the pipeline step to create the directory path to 
    # the phr_feats files.
    pipeline_step = "d3_phr_feats"

    # range parameters
    start_year = int(start)
    end_year = int(end)
    start_range = start_year
    end_range = end_year + 1

    # track the time in <year>.log
    log_file = pnames.tv_dir_year_file(corpus_root, corpus, "all", "log")
    s_log = open(log_file, "w")

    log_message = "Starting create_json_chunks_file for years: " + str(start) + " " + str(end)
    time = log.log_current_time(s_log, log_message, True)
    # remember the start_time for computing total time
    start_time = time


    # we'll bulk load all the data for a single year.
    # the argument to elasticsearch bulk is a list of dictionaries
    # alternating metadata and content.  We'll build this up in l_bulk_elements
    
    # The output is a list of lists, where each list contains the meta/content elements for n files
    l_colloc_bulk_lists = []
    l_colloc_bulk_elements = []

    d_chunk2prev_Npr = defaultdict(set)
    d_chunk2prev_V = defaultdict(set)
    d_chunk2docs = defaultdict(set)

    for year in range(start_range, end_range):

        # loop through files in file_list_file for the year
        filelist_file = pnames.fuse_filelist(fuse_corpus_root, corpus, year)
        s_file_list = open(filelist_file)

        # track the number of lines output to json file
        num_lines_output = 0
        json_file = pnames.tv_dir(corpus_root, corpus) + str(year) + ".chunks.json"
        s_json = codecs.open(json_file, "w", encoding='utf-8')

        file_count = 0
        for line in s_file_list:

            # if we have reached the file limit for a single bulk api call, add the sublist to l_colloc_bulk_lists 
            # and start a new sublist
            if (file_count % docs_per_bulk_load) == 0:
                # mod will be 0 for initial time through loop, so ignore this sublist
                if l_colloc_bulk_elements != []:
                    l_colloc_bulk_lists.append(l_colloc_bulk_elements)
                    l_colloc_bulk_elements = []

            file_count += 1
            
            line = line.strip("\n")
            # get the date/filename portion of path
            l_line_fields = line.split("\t")
            # get the rest of the file path (publication_year/id.xml)
            pub_year_and_file = l_line_fields[2]
            # extract patent_id from the filename (e.g. US5787464A from 1998/020/US5787464A.xml)
            patent_id = os.path.splitext(os.path.basename(pub_year_and_file))[0]
            phr_feats_file = pnames.fuse_phr_feats_file(fuse_corpus_root, corpus, pipeline_step, year, pub_year_and_file)

            #print "[invention]opening phr_feats: %s, id: %s" % (phr_feats_file, patent_id)
            #sys.exit()

            #s_phr_feats = codecs.open(phr_feats_file, encoding='utf-8')
            # handle compressed or uncompressed files
            s_phr_feats = open_input_file(phr_feats_file)

            # we need to combine all the chunks from a single sentence into one output entry
            l_chunks = []
            # assume the first sent_no in a document will always be 0
            last_sent_no = "0"
            for line in s_phr_feats:
                # todo make into regex ///
                if not(section_filter_p) or line.find("TITLE") > 0 or line.find("ABSTRACT") > 0 or line.find("SUMMARY") > 0:
                    # then process the line
                    l_data = line.split("\t")
                    # save chunk as phrase with "_" instead of blank connecting tokens
                    chunk = l_data[2].replace(" ", "_")
                    # extract the value field from the doc_loc feature to get the sent_no
                    sent_no = p_doc_loc.search(line).group(1)

                    # populate chunk dictionaries
                    d_chunk2docs[chunk].add(patent_id)
                    prev_V = p_prev_V.search(line)
                    if prev_V != None:
                        # store the matched feature value (group 1), not the match object
                        d_chunk2prev_V[chunk].add(prev_V.group(1))
                    prev_Npr = p_prev_Npr.search(line)
                    if prev_Npr != None:
                        d_chunk2prev_Npr[chunk].add(prev_Npr.group(1))

                    if sent_no == last_sent_no:
                        l_chunks.append(chunk)
                    else:
                        # we are done with the sentence, so write out the chunk list
                        json_string = format_colloc_chunks2json(patent_id, year, last_sent_no, l_chunks)
                        uid = "_".join([patent_id, last_sent_no])
                        
                        #print "last_sent_no: %s, chunks: %s, json: %s" % (last_sent_no, l_chunks, json_string)
                        # note the above print gives an error for non-ascii chars.
                        if write_to_file_p:
                            # make a json file with all the data to be loaded into elasticsearch
                            s_json.write("%s\n" % json_string)
                        l_colloc_bulk_elements.append(format_d_action(index_name, type_name, uid))
                        l_colloc_bulk_elements.append(format_colloc_d_content(patent_id, year, last_sent_no, l_chunks))

                        # keep the current chunk
                        l_chunks = [chunk]
                        last_sent_no = sent_no
                        num_lines_output += 1

            # output the last sentence of the file
            json_string = format_colloc_chunks2json(patent_id, year, last_sent_no, l_chunks)
            #print "last_sent_no: %s, chunks: %s, json: %s" % (last_sent_no, l_chunks, json_string)
            s_json.write("%s\n" % json_string)
            # rebuild the uid for this final sentence; the value left in uid by the
            # loop above still refers to the previously flushed sentence
            uid = "_".join([patent_id, last_sent_no])
            l_colloc_bulk_elements.append(format_d_action(index_name, type_name, uid))
            l_colloc_bulk_elements.append(format_colloc_d_content(patent_id, year, last_sent_no, l_chunks))
            num_lines_output += 1

            #"""
            # stop after n files for debugging
            if file_count > 3000:
                break
            #"""

            s_phr_feats.close()            

        # add the remaining elements to l_colloc_bulk_lists
        l_colloc_bulk_lists.append(l_colloc_bulk_elements)

        print "[docs.py]%i lines from %i files written to %s" % (num_lines_output, file_count, json_file)
        s_json.close()
    s_log.close()
    s_file_list.close()

    """
    # unfinished section to create chunk index
    # prepare data for chunk index
    for chunk in d_chunk2docs.keys():

        l_docs = d_chunk2docs[chunk]
        l_prev_V = d_chunk2prev_V[chunk]
        l_prev_Npr = d_chunk2prev_Npr[chunk]

    """


    # todo: eventually, return two lists
    return(l_colloc_bulk_lists)
Example 14
    def process_doc(self, filter_p=True, chunker_rules='en'):
        """Process the doc, creating all potential technology chunks and
        calculating their features."""

        debug_p = False
        if debug_p:
            print "[process_doc] filter_p: %s, writing to %s" % \
                  (filter_p, self.output)
        s_input = open_input_file(self.input)
        s_output = open_output_file(self.output, compress=self.compress)
        section = "FH_NONE"  # default section if document has no section header lines
        self.d_field[section] = []

        sent_no_in_section = 0
        for line in s_input:
            line = line.strip("\n")
            if debug_p:
                print "[process_doc] line: %s" % line

            if line[0:3] == "FH_":
                # we are at a section header; note we have to strip off both
                # final ':' and whitespace, since in some cases eg. Chinese
                # segmentation, the colon will be separated from the header term
                # by a blank.
                section = line.split("_")[1].rstrip(": ")
                self.d_field[section] = []
                sent_no_in_section = 0

            else:
                # process the sentence, the line is a list of token_tag pairs
                if section == "TITLE" or section == "ABSTRACT":
                    self.l_lc_title_noun.extend(lc_nouns(line))

                # call the appropriate Sentence subclass based on the language
                sent_args = [
                    self.next_sent_id, section, sent_no_in_section, line,
                    self.chunk_schema
                ]
                sent = sentence.get_sentence_for_lang(self.lang, sent_args)
                # get context info
                i = 0
                for chunk in sent.chunk_iter():
                    if chunk.label == "tech":
                        # index of chunk start in sentence => ci
                        ci = chunk.chunk_start
                        hsent = sent.highlight_chunk(i)
                        mallet_feature_list = get_features(sent, ci)
                        mallet_feature_list.sort()
                        uid = os.path.basename(self.input) + "_" + str(
                            self.next_chunk_id)
                        metadata_list = [uid, self.year, chunk.phrase.lower()]
                        if debug_p:
                            print "index: %i, start: %i, end: %i, sentence: %s" % \
                                (i, chunk.chunk_start, chunk.chunk_end, sent.sentence)
                        if add_chunk_data(self, chunk, section, filter_p):
                            add_line_to_phr_feats(metadata_list,
                                                  mallet_feature_list,
                                                  s_output)
                        chunk.sid = self.next_sent_id
                        self.d_chunk[self.next_chunk_id] = chunk
                        sent.chunks.append(chunk)
                        self.next_chunk_id += 1
                    i = chunk.chunk_end

                # keep track of the location of this sentence within the section
                sent_no_in_section += 1
                self.d_field[section].append(sent)
                self.d_sent[self.next_sent_id] = sent
                self.next_sent_id += 1

        s_input.close()
        s_output.close()
Example 15
    #print line,
    #print len(features), features
    (year, fname, term) = id.split('|', 2)
    #print label, term
    return label, term, features


mallet_file = sys.argv[1]
info_file = mallet_file + '.stats.txt'

pos_terms = {}
neg_terms = {}
features = {}
featvals = {}

with open_input_file(mallet_file) as fh:
    count = 0
    for line in fh:
        count += 1
        #if count > 10000: break
        if count % 100000 == 0: print count
        label, term, feats = parse_mallet_line(line)
        if label == 'y':
            pos_terms[term] = pos_terms.get(term, 0) + 1
        elif label == 'n':
            neg_terms[term] = neg_terms.get(term, 0) + 1
        for featval in feats:
            feat, val = featval.split('=', 1)
            #if feat == '234_shore': print line
            features[feat] = features.get(feat, 0) + 1
            if not featvals.has_key(feat):
Example 16
def gen_bulk_lists(index_name, type_name, domain, corpus, start, end, lines_per_bulk_load=100, section_filter_p=True, write_to_file_p=False, max_lines=0):
    # reading from fuse pipeline data
    # writing to local tv corpus dir
    # for years from start to end

    # we'll need the name of the pipeline step to create the directory path to 
    # the phr_feats files.
    pipeline_step = "d3_phr_feats"

    ###print "corpus_root: %s, corpus: %s"  % (corpus_root, str(corpus))

    # range parameters
    start_year = int(start)
    end_year = int(end)
    start_range = start_year
    end_range = end_year + 1

    # track the time in <year>.log
    log_file = pnames.tv_dir_year_file(corpus_root, corpus, "all", "log")
    s_log = open(log_file, "w")

    log_message = "[es_np.py gen_bulk_lists]Starting make_bulk_lists for years: " + str(start) + " " + str(end)
    time = log.log_current_time(s_log, log_message, True)
    # remember the start_time for computing total time
    start_time = time

    # we'll bulk load all the data for a single year.
    # the argument to elasticsearch bulk is a list of dictionaries
    # alternating metadata and content.  We'll build this up in l_bulk_elements
    
    # The output is a list of flattened paired elements, where each list contains the meta/content elements for n lines
    #l_bulk_lists = []
    l_bulk_elements = []

    for year in range(start_range, end_range):

        # loop through files in file_list_file for the year
        filelist_file = pnames.fuse_filelist(fuse_corpus_root, corpus, year)
        s_file_list = open(filelist_file)

        # track the number of lines output to json file
        num_lines_output = 0
        json_file = pnames.tv_dir(corpus_root, corpus) + str(year) + ".chunks.json"
        s_json = codecs.open(json_file, "w", encoding='utf-8')

        file_count = 0
        ###pdb.set_trace()

        for line in s_file_list:
            ###pdb.set_trace()


            file_count += 1
            
            line = line.strip("\n")
            # get the date/filename portion of path
            l_line_fields = line.split("\t")
            # get the rest of the file path (publication_year/id.xml)
            pub_year_and_file = l_line_fields[2]
            # extract patent_id from the filename (e.g. US5787464A from 1998/020/US5787464A.xml)
            patent_id = os.path.splitext(os.path.basename(pub_year_and_file))[0]

            # create a "doc" type entry to be bulk loaded.  This will be the parent of both "sent"
            # and "np" records in the index
            
            l_bulk_elements.append(format_d_action(index_name, "doc", patent_id))
            l_bulk_elements.append(format_doc_d_content(domain, year, patent_id))

            # lists to capture each sent's sheads and sterms
            sheads = []
            sterms = []
            # loc is the sentence number in the document, starting at 0
            current_sent = 0
            # Assume the initial section will be TITLE
            current_section = "TITLE"

            num_lines_output += 1

            # end creating doc index entry

            phr_feats_file = pnames.fuse_phr_feats_file(fuse_corpus_root, corpus, pipeline_step, year, pub_year_and_file)

            #print "[invention]opening phr_feats: %s, id: %s" % (phr_feats_file, patent_id)
            #sys.exit()

            #s_phr_feats = codecs.open(phr_feats_file, encoding='utf-8')
            # handle compressed or uncompressed files
            s_phr_feats = open_input_file(phr_feats_file)

            for line in s_phr_feats:

                # if we have reached the line limit for a single bulk api call, add the sublist to l_bulk_lists 
                # and start a new sublist
                if (num_lines_output % lines_per_bulk_load) == 0:
                    ###print "num_lines_output: %i" % num_lines_output
                    # mod will be 0 for initial time through loop, so ignore this sublist
                    if l_bulk_elements != []:
                        yield l_bulk_elements
                        l_bulk_elements = []

                # todo make into regex ///
                # Note that DESC was added 3/38/15, so indices created earlier do not contain that section.
                if not(section_filter_p) or line.find("TITLE") > 0 or line.find("ABSTRACT") > 0 or line.find("SUMMARY") > 0 or line.find("DESC") > 0:
                    # then process the line
                    l_data = line.split("\t")
                    # chunk is phrase with  blanks connecting tokens
                    uid = l_data[0]  # uid is doc_id + phrase number
                    phr = l_data[2]  # phrase with whitespace separating words

                    # extract the value field from the doc_loc feature to get the loc (sentence number)
                    loc = p_doc_loc.search(line).group(1)
                    # We will store it as an integer in es
                    loc = int(loc)

                    section = p_section.search(line).group(1)
                    pos = p_pos.search(line).group(1)
                    pos = pos.replace("_", " ")

                    # populate chunk dictionaries
                    prev_V = p_prev_V.search(line)
                    if prev_V != None:
                        # extract the matched string (group 0 is the entire match, while 
                        # group 1 is the first parenthesized subexpression in the pattern)
                        prev_V = prev_V.group(1)

                    prev_Npr = p_prev_Npr.search(line)
                    if prev_Npr != None:
                        prev_Npr = prev_Npr.group(1)

                    prev_J = p_prev_J.search(line)
                    if prev_J != None:
                        # extract the matched string (group 0 is the entire match, while 
                        # group 1 is the first parenthesized subexpression in the pattern)
                        prev_J = prev_J.group(1)


                    ###pdb.set_trace()
                    l_bulk_elements.append(format_d_action(index_name, "np", uid, parent_id=patent_id))
                    d_field_content = format_np_d_content(phr, prev_Npr, prev_V, prev_J, domain, year, patent_id, loc, section, pos)
                    l_bulk_elements.append(d_field_content)

                    # We will use data in d_field_content to avoid recomputing fields for sent.
                    shead = d_field_content["chead"]
                    sterm = d_field_content["cterm"]
                    # section can change whenever loc changes
                    section = d_field_content["section"]

                    # if loc != current_sent, we need to store a sent record for the current_loc
                    if loc != current_sent:
                        # store the record and start populating a new one
                        sent_id = patent_id + "_" + str(current_sent)
                        l_bulk_elements.append(format_d_action(index_name, "sent", sent_id, parent_id=patent_id))
                        l_sent_dict = format_sent_d_content(domain, year, patent_id, current_section, current_sent, sheads, sterms)
                        l_bulk_elements.append(l_sent_dict)

                        ###print "Adding sent: %s, sent_dict: %s" % (sent_id, l_sent_dict)
                        # re-initialize the sheads and sterms lists
                        sheads = [ shead ]
                        sterms = [ sterm ]
                        # increment count for "sent" output
                        num_lines_output += 1
                        # update the current_sent and section
                        current_sent = loc
                        current_section = section

                    else:
                        # we are still in the same sentence.
                        # add the latest term/head to the sent fields for current_sent
                        sheads.append(shead)
                        sterms.append(sterm)

                    # increment count for "np" output
                    num_lines_output += 1
       
                # stop after max_lines files for debugging
                ###print "num_lines_output: %i, max_lines: %i" % (num_lines_output, max_lines)
                if (max_lines != 0) and num_lines_output > max_lines: 
                    break
            # break out of file loop as well
            if (max_lines != 0) and num_lines_output > max_lines: 
                break

            # We need to store a sent record for the last sentence in the file (= current_sent)
            sent_id = patent_id + "_" + str(current_sent)
            ###print "[gen_bulk_list]last sent_id: %s, sheads: %s, sterms: %s\n" % (sent_id, sheads, sterms)
            l_bulk_elements.append(format_d_action(index_name, "sent", sent_id, parent_id=patent_id))
            l_bulk_elements.append(format_sent_d_content(domain, year, patent_id, current_section, current_sent, sheads, sterms))
            num_lines_output += 1

            s_phr_feats.close()            

        s_json.close()

    log_message = "[es_np_index.py]Completed make_bulk_lists for years: " + str(start) + " " + str(end) + ". Number of lines: " + str(num_lines_output)
    time = log.log_current_time(s_log, log_message, True)

    s_log.close()
    s_file_list.close()

    # yield the last remaining l_bulk_elements

    print "[gen_bulk_lists]%i lines from %i files written to index %s" % (num_lines_output, file_count, index_name)
    yield(l_bulk_elements)