import os
from collections import defaultdict
from glob import glob

# The Corpus / Document protobuf classes, the checkSame helper, and the
# gflags-style `flags` object are assumed to be imported from elsewhere
# in this repo.


def getMappingDicts_reGen(corpusdir, mapsdir, cons):
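    """Return (word_wid_dic, wid_did_dic, did_doc_dic) for the corpus.

    Reuses the pickled mapping dicts under `mapsdir` when the cached
    constraint set matches `cons`; otherwise rebuilds them with
    getNewMappingDicts().
    """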
    # check whether the cached constraint.set file exists
    cons_file = corpusdir + "/constraint.set"
    if not os.path.exists(cons_file):
        # Regenerate
        (word_wid_dic, wid_did_dic, did_doc_dic) = \
            getNewMappingDicts(corpusdir, mapsdir)
    else:
        # check whether the cached constraints are the same as cons
        old_cons = read_pickle(cons_file)
        if checkSame(cons, old_cons):
            # check the mapping dicts exist or not
            word_wid = mapsdir + "/word_wid.dict"
            wid_did = mapsdir + "/wid_did.dict"
            did_doc = mapsdir + "/did_doc.dict"
      
            if (os.path.exists(word_wid) and os.path.exists(wid_did)
                    and os.path.exists(did_doc)):
                word_wid_dic = read_pickle(word_wid)
                wid_did_dic = read_pickle(wid_did)
                did_doc_dic = read_pickle(did_doc)
            else:
                (word_wid_dic, wid_did_dic, did_doc_dic) = \
                    getNewMappingDicts(corpusdir, mapsdir)
        else:
            (word_wid_dic, wid_did_dic, did_doc_dic) = \
                getNewMappingDicts(corpusdir, mapsdir)
    write_pickle(cons, cons_file)
    return (word_wid_dic, wid_did_dic, did_doc_dic)
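

# read_pickle / write_pickle are used throughout this module; if the
# repo's own helpers are not importable here, the minimal stand-ins
# below match the call sites (plain pickle serialization is an
# assumption, not something the original code confirms).
try:
    import cPickle as pickle  # Python 2
except ImportError:
    import pickle  # Python 3


def read_pickle(path):
    # Load a single pickled object from `path`.
    with open(path, 'rb') as infile:
        return pickle.load(infile)


def write_pickle(obj, path):
    # Pickle `obj` to `path`, overwriting any existing file.
    with open(path, 'wb') as outfile:
        pickle.dump(obj, outfile)
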
def getNewAddedCons(corpusdir, cons_set, cons_list):
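    """Return the constraints in `cons_set` that are not in the cached set.

    Also refreshes the cached constraint.set and constraint.list files
    under `corpusdir`.
    """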
    # check whether the cached constraint.set file exists
    cons_file = corpusdir + "/constraint.set"
    if not os.path.exists(cons_file):
        cons_added_set = cons_set
    else:
        cons_old_set = read_pickle(cons_file)
        cons_added_set = cons_set.difference(cons_old_set)
    # save the full new constraint set and list for the next run
    write_pickle(cons_set, cons_file)
    cons_file = corpusdir + "/constraint.list"
    write_pickle(cons_list, cons_file)
    return cons_added_set


def build_index(corpus_dir, maps_file):
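  """Build an inverted index mapping word -> doc id -> token positions.

  Reads every protobuf *.index file under `corpus_dir`, pickles the
  resulting index to `maps_file`, and returns it.
  """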
  index = {}

  num_docs = 0
  read_voc = 0
  for ii in glob("%s/*.index" % corpus_dir):
    with open(ii, 'rb') as inputfile:
      protocorpus = Corpus()
      protocorpus.ParseFromString(inputfile.read())

    if read_voc == 0:
      # assume the vocab in every index file is identical, so load it and
      # initialize the index only once; re-reading it per file is too slow
      read_voc = 1
      vocab = {}
      for lang in protocorpus.tokens:
        for tt in lang.terms:  # `tt`, not `ii`: avoid shadowing the file loop
          vocab[tt.id] = tt.original
          index[tt.original] = defaultdict(set)

    for dd in protocorpus.doc_filenames:
      num_docs += 1

      with open("%s/%s" % (corpus_dir, dd), 'rb') as docfile:
        doc = Document()
        doc.ParseFromString(docfile.read())

      if num_docs % 250 == 0:
        print("%s %s %d vocab seen" % (doc.id, doc.title, len(index)))

      word_index = 0
      for jj in doc.sentences:
        for kk in jj.words:
          w = vocab[kk.token]
          index[w][doc.id].add(word_index)
          word_index += 1

    if flags.doc_limit > 0 and num_docs > flags.doc_limit:
      break

  write_pickle(index, maps_file)

  return index


def getNewMappingDicts(corpusdir, mapsdir):
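    """Rebuild the word/doc mapping dicts from the protobuf corpus.

    Returns (word_wid_dic, wid_did_dic, did_doc_dic) and pickles each
    of them under `mapsdir`.  wid_word_dic is built as well but is
    neither saved nor returned.
    """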

    # Map each document to its words via the corpus-level index file
    corpusLocation = corpusdir + "/model_topic_assign/doc_voc.index"

    with open(corpusLocation, 'rb') as inputfile:
        protocorpus = Corpus()
        protocorpus.ParseFromString(inputfile.read())
  
    voc_tokens = protocorpus.tokens
  
    word_wid_dic = dict()
    wid_word_dic = dict()
    wid_did_dic = dict()
    did_doc_dic = dict()
  
    for lang in voc_tokens:
        for term in lang.terms:
            w = term.original
            wid = term.id
            word_wid_dic[str(w)] = wid
            wid_word_dic[wid] = str(w)
            wid_did_dic[wid] = dict()
      
    docnames = protocorpus.doc_filenames
  
    for did, docname in enumerate(docnames):
        doclocation = corpusdir + "/model_topic_assign/" + docname
        with open(doclocation, 'rb') as docfile:
            doc = Document()
            doc.ParseFromString(docfile.read())

        did_doc_dic[did] = docname
    
        sents = doc.sentences
        word_index = 0

        for sent in sents:
            for word in sent.words:
                wid = word.token
                # record every token position of this word id in this doc
                if did not in wid_did_dic[wid]:
                    wid_did_dic[wid][did] = set()
                wid_did_dic[wid][did].add(word_index)
                word_index += 1
        
    # Save mapping dicts
    word_wid = mapsdir + "/word_wid.dict"
    wid_did = mapsdir + "/wid_did.dict"
    did_doc = mapsdir + "/did_doc.dict"
  
    write_pickle(word_wid_dic, word_wid)
    write_pickle(wid_did_dic, wid_did)
    write_pickle(did_doc_dic, did_doc)
  
    return (word_wid_dic, wid_did_dic, did_doc_dic)
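

if __name__ == "__main__":
    # Minimal smoke test; the directory layout below is an assumption,
    # not something the original module prescribes.
    (word_wid, wid_did, did_doc) = \
        getNewMappingDicts("output/corpus", "output/maps")
    print("%d words, %d docs mapped" % (len(word_wid), len(did_doc)))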