def getMappingDicts_reGen(corpusdir, mapsdir, cons):
    """Load the word/doc mapping dicts, regenerating them when needed.

    The dicts are regenerated when no cached constraint set exists, when
    the cached constraints differ from `cons`, or when any of the three
    cached mapping-dict files is missing. The current constraint set is
    always written back to disk before returning.

    Returns a (word_wid_dic, wid_did_dic, did_doc_dic) tuple.
    """
    cons_file = corpusdir + "/constraint.set"

    # Decide whether the cached dicts can be trusted at all.
    regenerate = True
    if os.path.exists(cons_file):
        old_cons = read_pickle(cons_file)
        # Reuse caches only when the constraints are unchanged.
        regenerate = not checkSame(cons, old_cons)

    if not regenerate:
        word_wid = mapsdir + "/word_wid.dict"
        wid_did = mapsdir + "/wid_did.dict"
        did_doc = mapsdir + "/did_doc.dict"
        if (os.path.exists(word_wid) and os.path.exists(wid_did)
                and os.path.exists(did_doc)):
            word_wid_dic = read_pickle(word_wid)
            wid_did_dic = read_pickle(wid_did)
            did_doc_dic = read_pickle(did_doc)
        else:
            # Constraints match but a cache file is missing.
            regenerate = True

    if regenerate:
        (word_wid_dic, wid_did_dic, did_doc_dic) = \
            getNewMappingDicts(corpusdir, mapsdir)

    # Refresh the cached constraint set for the next invocation.
    write_pickle(cons, cons_file)
    return (word_wid_dic, wid_did_dic, did_doc_dic)
def getNewAddedCons(corpusdir, cons_set, cons_list):
    """Return the constraints in `cons_set` that were not in the cached set.

    If no cached "constraint.set" file exists yet, every constraint counts
    as newly added. The full current set and list are then persisted so the
    next call compares against them.
    """
    set_file = corpusdir + "/constraint.set"
    if os.path.exists(set_file):
        previous_set = read_pickle(set_file)
        cons_added_set = cons_set.difference(previous_set)
    else:
        # First run: there is nothing cached, so everything is new.
        cons_added_set = cons_set
    # Persist the current constraints for future comparisons.
    write_pickle(cons_set, set_file)
    list_file = corpusdir + "/constraint.list"
    write_pickle(cons_list, list_file)
    return cons_added_set
def build_index(corpus_dir, maps_file):
    """Build an inverted index from the protobuf corpora under corpus_dir.

    Reads every "*.index" Corpus protobuf, then each referenced Document,
    producing ``index[word][doc_id] -> set(word positions)``. The index is
    pickled to `maps_file` and also returned.

    Fixes over the previous version:
      - file handles are closed deterministically via ``with`` (they were
        previously leaked);
      - the inner vocabulary loop no longer shadows the outer glob loop
        variable (both were named ``ii``).
    """
    index = defaultdict()
    num_docs = 0
    read_voc = 0
    vocab = {}
    for index_path in glob("%s/*.index" % corpus_dir):
        with open(index_path, 'rb') as inputfile:
            protocorpus = Corpus()
            protocorpus.ParseFromString(inputfile.read())
        if read_voc == 0:
            # Assume the vocab in each index file is the same, so load it
            # and initialize the index only once — reloading per file is
            # too slow.
            read_voc = 1
            for lang in protocorpus.tokens:
                for term in lang.terms:
                    vocab[term.id] = term.original
                    index[term.original] = defaultdict(set)
        for dd in protocorpus.doc_filenames:
            num_docs += 1
            with open("%s/%s" % (corpus_dir, dd), 'rb') as docfile:
                doc = Document()
                doc.ParseFromString(docfile.read())
            if num_docs % 250 == 0:
                # Same output as the old Py2 `print a, b, c, " vocab seen"`.
                print("%s %s %s  vocab seen" % (doc.id, doc.title, len(index)))
            word_index = 0
            for sentence in doc.sentences:
                for word in sentence.words:
                    w = vocab[word.token]
                    # Record every position of this word in this document.
                    index[w][doc.id].add(word_index)
                    word_index += 1
            if flags.doc_limit > 0 and num_docs > flags.doc_limit:
                # NOTE(review): this only exits the current file's doc loop;
                # the outer glob loop keeps going — confirm that is intended.
                break
    write_pickle(index, maps_file)
    return index
def getNewMappingDicts(corpusdir, mapsdir):
    """Rebuild the word/document mapping dicts from doc_voc.index.

    Parses the Corpus protobuf at ``corpusdir/model_topic_assign/
    doc_voc.index`` and each referenced Document, producing:
      word_wid_dic: word string -> word id
      wid_did_dic:  word id     -> {doc id: set(word positions)}
      did_doc_dic:  doc id      -> document filename
    All three dicts are pickled into `mapsdir` and returned as a tuple.

    Fixes over the previous version: file handles are closed via ``with``
    (they were leaked), the builtin ``id`` is no longer shadowed, the dead
    local reverse map (built but never used or returned) is dropped, and
    index-based loops are replaced with direct/enumerate iteration.
    """
    corpus_location = corpusdir + "/model_topic_assign/doc_voc.index"
    with open(corpus_location, 'rb') as inputfile:
        protocorpus = Corpus()
        protocorpus.ParseFromString(inputfile.read())

    word_wid_dic = dict()
    wid_did_dic = dict()
    did_doc_dic = dict()

    # Map every vocabulary term to its id and prepare its posting dict.
    for lang in protocorpus.tokens:
        for term in lang.terms:
            word = str(term.original)
            wid = term.id
            word_wid_dic[word] = wid
            wid_did_dic[wid] = dict()

    # Document ids are simply the position in doc_filenames.
    for did, docname in enumerate(protocorpus.doc_filenames):
        did_doc_dic[did] = docname
        doclocation = corpusdir + "/model_topic_assign/" + docname
        with open(doclocation, 'rb') as docfile:
            doc = Document()
            doc.ParseFromString(docfile.read())
        word_index = 0
        for sent in doc.sentences:
            for token in sent.words:
                wid = token.token
                # Record every position of this word id in this document.
                if did not in wid_did_dic[wid]:
                    wid_did_dic[wid][did] = set()
                wid_did_dic[wid][did].add(word_index)
                word_index += 1

    # Cache the dicts so later runs can reuse them.
    write_pickle(word_wid_dic, mapsdir + "/word_wid.dict")
    write_pickle(wid_did_dic, mapsdir + "/wid_did.dict")
    write_pickle(did_doc_dic, mapsdir + "/did_doc.dict")
    return (word_wid_dic, wid_did_dic, did_doc_dic)