def write_constraints_test():
    """Load the vocabulary and the constraints and cross-check them.

    Reads the vocabulary from ``flags.vocab`` and the must-link / cannot-link
    constraints from ``flags.constraints``, merges them with
    ``mergeAllConstraints``, asserts that every word counted by
    ``getConstraintCount`` is present in the vocabulary, and collects the
    vocabulary words that no constraint mentions.

    NOTE(review): the visible part of this function ends once
    ``remained_words`` has been computed; nothing is returned or written
    here -- confirm whether that is intended.

    Raises:
        NotImplementedError: if ``flags.merge_constraints`` is not set.
        AssertionError: if a constrained word is missing from the vocabulary.
    """
    flags.InitFlags()

    # read in vocab
    vocab = getVocab(flags.vocab)
    print("Read vocabulary")

    # read in constraints
    ml_cons, cl_cons = readConstraints(flags.constraints)

    # Merge constraints.  Only the merged representation is supported: without
    # this guard the unmerged case fell through to an unbound-variable error
    # on the very next statement.
    if not flags.merge_constraints:
        raise NotImplementedError(
            "flags.merge_constraints must be set: writing unmerged "
            "constraints is not implemented")
    ml_updated, cl_updated = mergeAllConstraints(ml_cons, cl_cons)

    # Constraints counts
    constraints_count = getConstraintCount(ml_updated, cl_updated)

    # get constraint vocab
    constraint_words = constraints_count.keys()

    # Check constraints
    check = [x for x in constraint_words if x not in vocab]
    assert not check, "Constraints were not in vocab: %s" % ", ".join(check)

    # Remained word list.  Membership is tested against a set built once;
    # scanning the key list for every vocabulary word is quadratic.
    constrained = set(constraint_words)
    remained_words = [x for x in vocab if x not in constrained]
term = ii.split("\t")[1].strip() word = voc.terms.add() word.id = line_num word.original = term word.ascii = term.encode("ascii", "replace") word.frequency = 0 word.stop_word = False lookup[line_num] = term line_num += 1 return c, lookup if __name__ == "__main__": flags.InitFlags() doc_num = 0 index_num = -1 record_num = 0 # Create the doc filter if we need it doc_filter = {} if flags.doc_filter: for ii in open(flags.doc_filter): if ii.startswith("Doc#"): # Ignore header continue fields = ii.split("\t") doc_filter[int(fields[0])] = fields[-1].strip() o_state = open("%s.topic_assignments" % flags.state_output, 'w')
def write_constraints_old():
    """Write the merged constraints as an ontology tree (older writer).

    Reads the vocabulary (``flags.vocab``) and the constraints
    (``flags.constraints``), merges them, and emits synsets through an
    ``OntologyWriter`` created for ``flags.wnname``:

    * synset 0 is "ROOT"; its children are numbered 1..num_children;
    * every merged ML constraint becomes a leaf child of the root;
    * every merged CL constraint becomes an internal child of the root whose
      children are its cliques.  A clique tagged ``NL_IN_`` is itself an
      internal node with leaf children (four levels); any other clique is a
      leaf (three levels);
    * vocabulary words that no constraint mentions are gathered in one extra
      leaf named "NL_REMAINED_", each with weight 1.

    Leaf words from constraints are weighted by the reciprocal of their entry
    in the table returned by ``getConstraintCount``.

    Raises:
        NotImplementedError: if ``flags.merge_constraints`` is not set.
        AssertionError: if a constrained word is missing from the vocabulary,
            or if the number of root children written differs from the
            number announced to the root synset.
    """
    flags.InitFlags()

    # read in vocab
    vocab = getVocab(flags.vocab)
    print("Read vocabulary")

    # read in constraints
    ml_cons, cl_cons = readConstraints(flags.constraints)

    # Merge constraints.  Only the merged representation is supported: without
    # this guard the unmerged case fell through to an unbound-variable error
    # on the very next statement.
    if not flags.merge_constraints:
        raise NotImplementedError(
            "flags.merge_constraints must be set: writing unmerged "
            "constraints is not implemented")
    ml_updated, cl_updated = mergeAllConstraints(ml_cons, cl_cons)

    # Constraints counts
    constraints_count = getConstraintCount(ml_updated, cl_updated)

    # get constraint vocab
    constraint_words = constraints_count.keys()

    # Check constraints
    check = [x for x in constraint_words if x not in vocab]
    assert not check, "Constraints were not in vocab: %s" % ", ".join(check)

    # Remained word list.  Membership is tested against a set built once;
    # scanning the key list for every vocabulary word is quadratic.
    constrained = set(constraint_words)
    remained_words = [x for x in vocab if x not in constrained]

    #########################################################

    print(flags.wnname)
    wnname = flags.wnname
    o = OntologyWriter(wnname)

    num_children = len(ml_updated) + len(cl_updated)
    if remained_words:
        # +1 is for one synset for the unconstrained words
        num_children += 1

    o.AddSynset(0, "ROOT", xrange(1, num_children + 1), [])
    # Highest synset index handed out so far; deeper levels are numbered
    # after the root's children.
    allocated_index = num_children

    # Running numbers used in leaf names, shared by every level of the tree.
    leaf_counts = {"ML": 0, "NL": 0}

    def weighted_words(words):
        # (language, word, weight) triples; the weight is the reciprocal of
        # the word's entry in constraints_count.
        return [(ENGLISH, word, 1.0 / constraints_count[word])
                for word in words]

    def leaf_name(prefix, number, words):
        # Tag + running number + at most 20 characters of the joined words.
        return prefix + "%i_%s" % (number, ":".join(words)[:20])

    def add_clique_leaf(index, clique):
        # Cliques tagged exactly ML_ use the ML counter, all others the NL one.
        kind = "ML" if re.search('^ML_$', clique[0]) else "NL"
        leaf_counts[kind] += 1
        name = leaf_name(clique[0], leaf_counts[kind], clique[1])
        o.AddSynset(index, name, [], weighted_words(clique[1]))

    # Add ML constraints
    root_child = 0
    for cons in ml_updated:
        root_child += 1
        leaf_counts["ML"] += 1
        name = leaf_name(cons[0], leaf_counts["ML"], cons[1])
        o.AddSynset(root_child, name, [], weighted_words(cons[1]))
    print("Added %i ML constraint nodes" % (leaf_counts["ML"]))

    # Add CL constraints
    cl_count = 0
    nl_in_count = 0
    for cons in cl_updated:
        root_child += 1
        cl_count += 1
        cliques = cons[1]

        # Reserve a contiguous index range for this constraint's cliques.
        first = allocated_index + 1
        o.AddSynset(root_child, cons[0] + "%i" % (cl_count),
                    xrange(first, first + len(cliques)), [])
        clique_index = allocated_index
        allocated_index += len(cliques)

        for clique in cliques:
            clique_index += 1
            if re.search('^NL_IN_$', clique[0]):
                # four levels tree: the clique is an internal node, so it
                # reserves a further index range for its own leaves.
                nl_in_count += 1
                inner_cliques = clique[1]
                first = allocated_index + 1
                o.AddSynset(clique_index, clique[0] + "%i" % (nl_in_count),
                            xrange(first, first + len(inner_cliques)), [])
                inner_index = allocated_index
                allocated_index += len(inner_cliques)
                for in_clique in inner_cliques:
                    inner_index += 1
                    add_clique_leaf(inner_index, in_clique)
            else:
                # three levels tree: the clique itself is a leaf
                add_clique_leaf(clique_index, clique)

    # Unused words
    if remained_words:
        wordset = [(ENGLISH, word, 1) for word in remained_words]
        root_child += 1
        o.AddSynset(root_child, "NL_REMAINED_", [], wordset)

    print("Added %i total nodes for vocab" % root_child)
    assert root_child == num_children, "Mismatch of children %i %i" \
        % (root_child, num_children)

    # Add root
    o.Finalize()
def write_constraints():
    """Write the merged constraints as an ontology tree.

    Reads the vocabulary (``flags.vocab``) and the constraints
    (``flags.constraints``), merges them, and emits synsets through an
    ``OntologyWriter`` created for ``flags.wnname``.  Synset 0 is "ROOT" with
    children numbered 1..num_children.  Each merged ML and CL constraint is
    handed to ``write_internal_nodes``; vocabulary words that no constraint
    mentions are written through ``write_leaf`` under the name
    "NL_REMAINED_".

    Raises:
        NotImplementedError: if ``flags.merge_constraints`` is not set.
        AssertionError: if a constrained word is missing from the vocabulary,
            or if the number of root children written differs from the
            number announced to the root synset.
    """
    flags.InitFlags()

    # read in vocab
    vocab = getVocab(flags.vocab)
    print("Read vocabulary")

    # read in constraints
    ml_cons, cl_cons = readConstraints(flags.constraints)

    # Merge constraints.  Only the merged representation is supported: without
    # this guard the unmerged case fell through to an unbound-variable error
    # on the very next statement.
    if not flags.merge_constraints:
        raise NotImplementedError(
            "flags.merge_constraints must be set: writing unmerged "
            "constraints is not implemented")
    ml_updated, cl_updated = mergeAllConstraints(ml_cons, cl_cons)

    print(ml_updated)
    print(cl_updated)

    # Constraints counts
    constraints_count = getConstraintCount(ml_updated, cl_updated)

    # get constraint vocab
    constraint_words = constraints_count.keys()

    # Check constraints
    check = [x for x in constraint_words if x not in vocab]
    assert not check, "Constraints were not in vocab: %s" % ", ".join(check)

    # Remained word list.  Membership is tested against a set built once;
    # scanning the key list for every vocabulary word is quadratic.
    constrained = set(constraint_words)
    remained_words = [x for x in vocab if x not in constrained]

    #########################################################

    print(flags.wnname)
    wnname = flags.wnname
    o = OntologyWriter(wnname)

    num_children = len(ml_updated) + len(cl_updated)
    if remained_words:
        # +1 is for one synset for the unconstrained words
        num_children += 1

    o.AddSynset(0, "ROOT", xrange(1, num_children + 1), [])
    # Highest synset index handed out so far; the helpers below take it and
    # return the updated value.
    allocated_index = num_children
    rootchild_index = 0
    leaf_count = 0

    # Add ML constraints, then CL constraints
    for constraint_group in (ml_updated, cl_updated):
        for cons in constraint_group:
            rootchild_index, leaf_count, allocated_index = \
                write_internal_nodes(cons, rootchild_index, leaf_count,
                                     allocated_index, o, constraints_count)

    # Add Unused words
    if remained_words:
        remained = ["NL_REMAINED_", remained_words]
        rootchild_index, leaf_count, allocated_index = \
            write_leaf(remained, rootchild_index, leaf_count,
                       allocated_index, o, constraints_count)

    print("Added %i total nodes for vocab" % rootchild_index)
    assert rootchild_index == num_children, "Mismatch of children %i %i" \
        % (rootchild_index, num_children)
    print("Number of leaves: %s" % (leaf_count,))

    # Add root
    o.Finalize()